Total coverage: 232421 of 1575467 lines (15%)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/uaccess.h>

#include "masklog.h"

struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
EXPORT_SYMBOL_GPL(mlog_and_bits);
struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
EXPORT_SYMBOL_GPL(mlog_not_bits);

static ssize_t mlog_mask_show(u64 mask, char *buf)
{
	char *state;

	if (__mlog_test_u64(mask, mlog_and_bits))
		state = "allow";
	else if (__mlog_test_u64(mask, mlog_not_bits))
		state = "deny";
	else
		state = "off";

	return snprintf(buf, PAGE_SIZE, "%s\n", state);
}

static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
{
	if (!strncasecmp(buf, "allow", 5)) {
		__mlog_set_u64(mask, mlog_and_bits);
		__mlog_clear_u64(mask, mlog_not_bits);
	} else if (!strncasecmp(buf, "deny", 4)) {
		__mlog_set_u64(mask, mlog_not_bits);
		__mlog_clear_u64(mask, mlog_and_bits);
	} else if (!strncasecmp(buf, "off", 3)) {
		__mlog_clear_u64(mask, mlog_not_bits);
		__mlog_clear_u64(mask, mlog_and_bits);
	} else
		return -EINVAL;

	return count;
}

void __mlog_printk(const u64 *mask, const char *func, int line,
		   const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;
	const char *level;
	const char *prefix = "";

	if (!__mlog_test_u64(*mask, mlog_and_bits) ||
	    __mlog_test_u64(*mask, mlog_not_bits))
		return;

	if (*mask & ML_ERROR) {
		level = KERN_ERR;
		prefix = "ERROR: ";
	} else if (*mask & ML_NOTICE) {
		level = KERN_NOTICE;
	} else {
		level = KERN_INFO;
	}

	va_start(args, fmt);

	vaf.fmt = fmt;
	vaf.va = &args;

	printk("%s(%s,%u,%u):%s:%d %s%pV",
	       level, current->comm, task_pid_nr(current),
	       raw_smp_processor_id(), func, line, prefix, &vaf);

	va_end(args);
}
EXPORT_SYMBOL_GPL(__mlog_printk);

struct mlog_attribute {
	struct attribute attr;
	u64 mask;
};

#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)

#define define_mask(_name) {			\
	.attr = {				\
		.name = #_name,			\
		.mode = S_IRUGO | S_IWUSR,	\
	},					\
	.mask = ML_##_name,			\
}

static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
	define_mask(TCP),
	define_mask(MSG),
	define_mask(SOCKET),
	define_mask(HEARTBEAT),
	define_mask(HB_BIO),
	define_mask(DLMFS),
	define_mask(DLM),
	define_mask(DLM_DOMAIN),
	define_mask(DLM_THREAD),
	define_mask(DLM_MASTER),
	define_mask(DLM_RECOVERY),
	define_mask(DLM_GLUE),
	define_mask(VOTE),
	define_mask(CONN),
	define_mask(QUORUM),
	define_mask(BASTS),
	define_mask(CLUSTER),
	define_mask(ERROR),
	define_mask(NOTICE),
	define_mask(KTHREAD),
};

static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, };
ATTRIBUTE_GROUPS(mlog_default);

static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
			 char *buf)
{
	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);

	return mlog_mask_show(mlog_attr->mask, buf);
}

static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
			  const char *buf, size_t count)
{
	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);

	return mlog_mask_store(mlog_attr->mask, buf, count);
}

static const struct sysfs_ops mlog_attr_ops = {
	.show  = mlog_show,
	.store = mlog_store,
};

static struct kobj_type mlog_ktype = {
	.default_groups = mlog_default_groups,
	.sysfs_ops      = &mlog_attr_ops,
};

static struct kset mlog_kset = {
	.kobj = {.ktype = &mlog_ktype},
};

int mlog_sys_init(struct kset *o2cb_kset)
{
	int i = 0;

	while (mlog_attrs[i].attr.mode) {
		mlog_default_attrs[i] = &mlog_attrs[i].attr;
		i++;
	}
	mlog_default_attrs[i] = NULL;

	kobject_set_name(&mlog_kset.kobj, "logmask");
	mlog_kset.kobj.kset = o2cb_kset;
	return kset_register(&mlog_kset);
}

void mlog_sys_shutdown(void)
{
	kset_unregister(&mlog_kset);
}
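The allow/deny/off tri-state that mlog_mask_show() and mlog_mask_store() expose through sysfs is easiest to see in isolation. Below is a minimal userspace sketch of the same semantics, under the assumption that "allow" and "deny" are mutually exclusive per mask bit; the names mask_state(), mask_store() and ML_EXAMPLE are illustrative stand-ins, not the kernel's __mlog_*_u64() helpers.

/* Userspace sketch of the masklog allow/deny/off tri-state (illustrative). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t and_bits;	/* categories explicitly allowed */
static uint64_t not_bits;	/* categories explicitly denied  */

static const char *mask_state(uint64_t mask)
{
	if (and_bits & mask)
		return "allow";
	if (not_bits & mask)
		return "deny";
	return "off";
}

static void mask_store(uint64_t mask, const char *val)
{
	if (!strcmp(val, "allow")) {
		and_bits |= mask;
		not_bits &= ~mask;
	} else if (!strcmp(val, "deny")) {
		not_bits |= mask;
		and_bits &= ~mask;
	} else {			/* "off" */
		and_bits &= ~mask;
		not_bits &= ~mask;
	}
}

int main(void)
{
	const uint64_t ML_EXAMPLE = 1ULL << 3;	/* hypothetical category bit */

	mask_store(ML_EXAMPLE, "allow");
	printf("%s\n", mask_state(ML_EXAMPLE));	/* prints "allow" */
	mask_store(ML_EXAMPLE, "deny");
	printf("%s\n", mask_state(ML_EXAMPLE));	/* prints "deny"  */
	return 0;
}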
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/types.h>
#include <net/pkt_sched.h>

#include "sch_mqprio_lib.h"

/* Returns true if the intervals [a, b) and [c, d) overlap. */
static bool intervals_overlap(int a, int b, int c, int d)
{
	int left = max(a, c), right = min(b, d);

	return left < right;
}

static int mqprio_validate_queue_counts(struct net_device *dev,
					const struct tc_mqprio_qopt *qopt,
					bool allow_overlapping_txqs,
					struct netlink_ext_ack *extack)
{
	int i, j;

	for (i = 0; i < qopt->num_tc; i++) {
		unsigned int last = qopt->offset[i] + qopt->count[i];

		if (!qopt->count[i]) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d",
					       i);
			return -EINVAL;
		}

		/* Verify the queue count is in tx range being equal to the
		 * real_num_tx_queues indicates the last queue is in use.
		 */
		if (qopt->offset[i] >= dev->real_num_tx_queues ||
		    last > dev->real_num_tx_queues) {
			NL_SET_ERR_MSG_FMT_MOD(extack,
					       "Queues %d:%d for TC %d exceed the %d TX queues available",
					       qopt->count[i], qopt->offset[i],
					       i, dev->real_num_tx_queues);
			return -EINVAL;
		}

		if (allow_overlapping_txqs)
			continue;

		/* Verify that the offset and counts do not overlap */
		for (j = i + 1; j < qopt->num_tc; j++) {
			if (intervals_overlap(qopt->offset[i], last,
					      qopt->offset[j],
					      qopt->offset[j] +
					      qopt->count[j])) {
				NL_SET_ERR_MSG_FMT_MOD(extack,
						       "TC %d queues %d@%d overlap with TC %d queues %d@%d",
						       i, qopt->count[i], qopt->offset[i],
						       j, qopt->count[j], qopt->offset[j]);
				return -EINVAL;
			}
		}
	}

	return 0;
}

int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
			 bool validate_queue_counts,
			 bool allow_overlapping_txqs,
			 struct netlink_ext_ack *extack)
{
	int i, err;

	/* Verify num_tc is not out of max range */
	if (qopt->num_tc > TC_MAX_QUEUE) {
		NL_SET_ERR_MSG(extack,
			       "Number of traffic classes is outside valid range");
		return -EINVAL;
	}

	/* Verify priority mapping uses valid tcs */
	for (i = 0; i <= TC_BITMASK; i++) {
		if (qopt->prio_tc_map[i] >= qopt->num_tc) {
			NL_SET_ERR_MSG(extack,
				       "Invalid traffic class in priority to traffic class mapping");
			return -EINVAL;
		}
	}

	if (validate_queue_counts) {
		err = mqprio_validate_queue_counts(dev, qopt,
						   allow_overlapping_txqs,
						   extack);
		if (err)
			return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mqprio_validate_qopt);

void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt)
{
	int tc, num_tc = netdev_get_num_tc(dev);

	qopt->num_tc = num_tc;
	memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map));

	for (tc = 0; tc < num_tc; tc++) {
		qopt->count[tc] = dev->tc_to_txq[tc].count;
		qopt->offset[tc] = dev->tc_to_txq[tc].offset;
	}
}
EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);

void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
			  struct tc_mqprio_qopt_offload *mqprio)
{
	unsigned long preemptible_tcs = 0;
	int tc;

	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
		if (fp[tc] == TC_FP_PREEMPTIBLE)
			preemptible_tcs |= BIT(tc);

	mqprio->preemptible_tcs = preemptible_tcs;
}
EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Shared mqprio qdisc code currently between taprio and mqprio");
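As a quick illustration of the check performed by mqprio_validate_queue_counts(), here is a standalone userspace sketch of the same [offset, offset + count) overlap test over per-TC queue ranges; the values are made up and nothing here is kernel API.

/* Standalone sketch of the mqprio per-TC TXQ overlap check (illustrative). */
#include <stdbool.h>
#include <stdio.h>

static bool intervals_overlap(int a, int b, int c, int d)
{
	int left = (a > c) ? a : c;
	int right = (b < d) ? b : d;

	return left < right;
}

int main(void)
{
	/* TC0 -> 4 queues at offset 0, TC1 -> 4 queues at offset 4: no overlap */
	int count[2]  = { 4, 4 };
	int offset[2] = { 0, 4 };
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = i + 1; j < 2; j++)
			if (intervals_overlap(offset[i], offset[i] + count[i],
					      offset[j], offset[j] + count[j]))
				printf("TC %d and TC %d overlap\n", i, j);

	return 0;
}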
// SPDX-License-Identifier: GPL-2.0
/*
 * Tty port functions
 */

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/slab.h>
#include
<linux/sched/signal.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/serdev.h> #include "tty.h" static size_t tty_port_default_receive_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return 0; ld = tty_ldisc_ref(tty); if (!ld) return 0; count = tty_ldisc_receive_buf(ld, p, f, count); tty_ldisc_deref(ld); return count; } static void tty_port_default_lookahead_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return; ld = tty_ldisc_ref(tty); if (!ld) return; if (ld->ops->lookahead_buf) ld->ops->lookahead_buf(ld->tty, p, f, count); tty_ldisc_deref(ld); } static void tty_port_default_wakeup(struct tty_port *port) { struct tty_struct *tty = tty_port_tty_get(port); if (tty) { tty_wakeup(tty); tty_kref_put(tty); } } const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .lookahead_buf = tty_port_default_lookahead_buf, .write_wakeup = tty_port_default_wakeup, }; EXPORT_SYMBOL_GPL(tty_port_default_client_ops); /** * tty_port_init - initialize tty_port * @port: tty_port to initialize * * Initializes the state of struct tty_port. When a port was initialized using * this function, one has to destroy the port by tty_port_destroy(). Either * indirectly by using &tty_port refcounting (tty_port_put()) or directly if * refcounting is not used. */ void tty_port_init(struct tty_port *port) { memset(port, 0, sizeof(*port)); tty_buffer_init(port); init_waitqueue_head(&port->open_wait); init_waitqueue_head(&port->delta_msr_wait); mutex_init(&port->mutex); mutex_init(&port->buf_mutex); spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * Provide the tty layer with a link from a tty (specified by @index) to a * tty_port (@port). Use this only if neither tty_port_register_device() nor * tty_port_install() is used in the driver. If used, this has to be called * before tty_register_driver(). */ void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { if (WARN_ON(index >= driver->num)) return; driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); /** * tty_port_register_device - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * * It is the same as tty_register_device() except the provided @port is linked * to a concrete tty specified by @index. Use this or tty_port_install() (or * both). Call tty_port_link_device() as a last resort. 
*/ struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device) { return tty_port_register_device_attr(port, driver, index, device, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device); /** * tty_port_register_device_attr - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * It is the same as tty_register_device_attr() except the provided @port is * linked to a concrete tty specified by @index. Use this or tty_port_install() * (or both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); /** * tty_port_register_device_attr_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware device * @parent: parent if exists, otherwise NULL * @drvdata: driver data for the device * @attr_grp: attribute group for the device * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp) { struct device *dev; tty_port_link_device(port, driver, index); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { /* Skip creating cdev if we registered a serdev device */ return dev; } return tty_register_device_attr(driver, index, parent, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); /** * tty_port_register_device_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware controller device * @parent: parent if exists, otherwise NULL * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent) { return tty_port_register_device_attr_serdev(port, driver, index, host, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); /** * tty_port_unregister_device - deregister a tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * If a tty or serdev device is registered with a call to * tty_port_register_device_serdev() then this function must be called when * the device is gone. 
*/ void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { int ret; ret = serdev_tty_port_unregister(port); if (ret == 0) return; tty_unregister_device(driver, index); } EXPORT_SYMBOL_GPL(tty_port_unregister_device); int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ mutex_lock(&port->buf_mutex); if (port->xmit_buf == NULL) { port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL); if (port->xmit_buf) kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE); } mutex_unlock(&port->buf_mutex); if (port->xmit_buf == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(tty_port_alloc_xmit_buf); void tty_port_free_xmit_buf(struct tty_port *port) { mutex_lock(&port->buf_mutex); free_page((unsigned long)port->xmit_buf); port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); mutex_unlock(&port->buf_mutex); } EXPORT_SYMBOL(tty_port_free_xmit_buf); /** * tty_port_destroy - destroy inited port * @port: tty port to be destroyed * * When a port was initialized using tty_port_init(), one has to destroy the * port by this function. Either indirectly by using &tty_port refcounting * (tty_port_put()) or directly if refcounting is not used. */ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); } EXPORT_SYMBOL(tty_port_destroy); static void tty_port_destructor(struct kref *kref) { struct tty_port *port = container_of(kref, struct tty_port, kref); /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); if (port->ops && port->ops->destruct) port->ops->destruct(port); else kfree(port); } /** * tty_port_put - drop a reference to tty_port * @port: port to drop a reference of (can be NULL) * * The final put will destroy and free up the @port using * @port->ops->destruct() hook, or using kfree() if not provided. */ void tty_port_put(struct tty_port *port) { if (port) kref_put(&port->kref, tty_port_destructor); } EXPORT_SYMBOL(tty_port_put); /** * tty_port_tty_get - get a tty reference * @port: tty port * * Return a refcount protected tty instance or %NULL if the port is not * associated with a tty (eg due to close or hangup). */ struct tty_struct *tty_port_tty_get(struct tty_port *port) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&port->lock, flags); tty = tty_kref_get(port->tty); spin_unlock_irqrestore(&port->lock, flags); return tty; } EXPORT_SYMBOL(tty_port_tty_get); /** * tty_port_tty_set - set the tty of a port * @port: tty port * @tty: the tty * * Associate the port and tty pair. Manages any internal refcounts. Pass %NULL * to deassociate a port. */ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&port->lock, flags); tty_kref_put(port->tty); port->tty = tty_kref_get(tty); spin_unlock_irqrestore(&port->lock, flags); } EXPORT_SYMBOL(tty_port_tty_set); /** * tty_port_shutdown - internal helper to shutdown the device * @port: tty port to be shut down * @tty: the associated tty * * It is used by tty_port_hangup() and tty_port_close(). Its task is to * shutdown the device if it was initialized (note consoles remain * functioning). It lowers DTR/RTS (if @tty has HUPCL set) and invokes * @port->ops->shutdown(). 
*/ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) { mutex_lock(&port->mutex); if (port->console) goto out; if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. */ if (tty && C_HUPCL(tty)) tty_port_lower_dtr_rts(port); if (port->ops->shutdown) port->ops->shutdown(port); } out: mutex_unlock(&port->mutex); } /** * tty_port_hangup - hangup helper * @port: tty port * * Perform port level tty hangup flag and count changes. Drop the tty * reference. * * Caller holds tty lock. */ void tty_port_hangup(struct tty_port *port) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&port->lock, flags); port->count = 0; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } EXPORT_SYMBOL(tty_port_hangup); /** * tty_port_tty_hangup - helper to hang up a tty * @port: tty port * @check_clocal: hang only ttys with %CLOCAL unset? */ void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) { struct tty_struct *tty = tty_port_tty_get(port); if (tty && (!check_clocal || !C_CLOCAL(tty))) tty_hangup(tty); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(tty_port_tty_hangup); /** * tty_port_tty_wakeup - helper to wake up a tty * @port: tty port */ void tty_port_tty_wakeup(struct tty_port *port) { port->client_ops->write_wakeup(port); } EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); /** * tty_port_carrier_raised - carrier raised check * @port: tty port * * Wrapper for the carrier detect logic. For the moment this is used * to hide some internal details. This will eventually become entirely * internal to the tty port. */ bool tty_port_carrier_raised(struct tty_port *port) { if (port->ops->carrier_raised == NULL) return true; return port->ops->carrier_raised(port); } EXPORT_SYMBOL(tty_port_carrier_raised); /** * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_raise_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, true); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); /** * tty_port_lower_dtr_rts - Lower DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_lower_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, false); } EXPORT_SYMBOL(tty_port_lower_dtr_rts); /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened * @tty: the tty device being bound * @filp: the file pointer of the opener or %NULL * * Implement the core POSIX/SuS tty behaviour when opening a tty device. * Handles: * * - hangup (both before and during) * - non blocking open * - rts/dtr/dcd * - signals * - port flags and counts * * The passed @port must implement the @port->ops->carrier_raised method if it * can do carrier detect and the @port->ops->dtr_rts method if it supports * software management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. 
* * Caller holds tty lock. * * Note: May drop and reacquire tty lock when blocking, so @tty and @port may * have changed state (eg., may have been hung up). */ int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp) { int do_clocal = 0, retval; unsigned long flags; DEFINE_WAIT(wait); /* if non-blocking mode is set we can pass directly to open unless * the port has just hung up or is in another error state. */ if (tty_io_error(tty)) { tty_port_set_active(port, true); return 0; } if (filp == NULL || (filp->f_flags & O_NONBLOCK)) { /* Indicate we are open */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); tty_port_set_active(port, true); return 0; } if (C_CLOCAL(tty)) do_clocal = 1; /* Block waiting until we can proceed. We may need to wait for the * carrier, but we must also wait for any close that is in progress * before the next open may complete. */ retval = 0; /* The port lock protects the port counts */ spin_lock_irqsave(&port->lock, flags); port->count--; port->blocked_open++; spin_unlock_irqrestore(&port->lock, flags); while (1) { /* Indicate we are open */ if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE); /* Check for a hangup or uninitialised port. * Return accordingly. */ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { if (port->flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; else retval = -ERESTARTSYS; break; } /* * Probe the carrier. For devices with no carrier detect * tty_port_carrier_raised will always return true. * Never ask drivers if CLOCAL is set, this causes troubles * on some hardware. */ if (do_clocal || tty_port_carrier_raised(port)) break; if (signal_pending(current)) { retval = -ERESTARTSYS; break; } tty_unlock(tty); schedule(); tty_lock(tty); } finish_wait(&port->open_wait, &wait); /* Update counts. A parallel hangup will have set count to zero and * we must not mess that up further. */ spin_lock_irqsave(&port->lock, flags); if (!tty_hung_up_p(filp)) port->count++; port->blocked_open--; spin_unlock_irqrestore(&port->lock, flags); if (retval == 0) tty_port_set_active(port, true); return retval; } EXPORT_SYMBOL(tty_port_block_til_ready); static void tty_port_drain_delay(struct tty_port *port, struct tty_struct *tty) { unsigned int bps = tty_get_baud_rate(tty); long timeout; if (bps > 1200) { timeout = (HZ * 10 * port->drain_delay) / bps; timeout = max_t(long, timeout, HZ / 10); } else { timeout = 2 * HZ; } schedule_timeout_interruptible(timeout); } /** * tty_port_close_start - helper for tty->ops->close, part 1/2 * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * Decrements and checks open count. Flushes the port if this is the last * close. That means, dropping the data from the outpu buffer on the device and * waiting for sending logic to finish. The rest of close handling is performed * in tty_port_close_end(). * * Locking: Caller holds tty lock. 
* * Return: 1 if this is the last close, otherwise 0 */ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp) { unsigned long flags; if (tty_hung_up_p(filp)) return 0; spin_lock_irqsave(&port->lock, flags); if (tty->count == 1 && port->count != 1) { tty_warn(tty, "%s: tty->count = 1 port count = %d\n", __func__, port->count); port->count = 1; } if (--port->count < 0) { tty_warn(tty, "%s: bad port count (%d)\n", __func__, port->count); port->count = 0; } if (port->count) { spin_unlock_irqrestore(&port->lock, flags); return 0; } spin_unlock_irqrestore(&port->lock, flags); tty->closing = 1; if (tty_port_initialized(port)) { /* Don't block on a stalled port, just pull the chain */ if (tty->flow.tco_stopped) tty_driver_flush_buffer(tty); if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, port->closing_wait); if (port->drain_delay) tty_port_drain_delay(port, tty); } /* Flush the ldisc buffering */ tty_ldisc_flush(tty); /* Report to caller this is the last port reference */ return 1; } EXPORT_SYMBOL(tty_port_close_start); /** * tty_port_close_end - helper for tty->ops->close, part 2/2 * @port: tty_port of the device * @tty: tty being closed * * This is a continuation of the first part: tty_port_close_start(). This * should be called after turning off the device. It flushes the data from the * line discipline and delays the close by @port->close_delay. * * Locking: Caller holds tty lock. */ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; tty_ldisc_flush(tty); tty->closing = 0; spin_lock_irqsave(&port->lock, flags); if (port->blocked_open) { spin_unlock_irqrestore(&port->lock, flags); if (port->close_delay) msleep_interruptible(jiffies_to_msecs(port->close_delay)); spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); } EXPORT_SYMBOL(tty_port_close_end); /** * tty_port_close - generic tty->ops->close handler * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->close. It wraps a * sequence of tty_port_close_start(), tty_port_shutdown(), and * tty_port_close_end(). The latter two are called only if this is the last * close. See the respective functions for the details. * * Locking: Caller holds tty lock */ void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_port_close_start(port, tty, filp) == 0) return; tty_port_shutdown(port, tty); if (!port->console) set_bit(TTY_IO_ERROR, &tty->flags); tty_port_close_end(port, tty); tty_port_tty_set(port, NULL); } EXPORT_SYMBOL(tty_port_close); /** * tty_port_install - generic tty->ops->install handler * @port: tty_port of the device * @driver: tty_driver for this device * @tty: tty to be installed * * It is the same as tty_standard_install() except the provided @port is linked * to a concrete tty specified by @tty. Use this or tty_port_register_device() * (or both). Call tty_port_link_device() as a last resort. */ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); /** * tty_port_open - generic tty->ops->open handler * @port: tty_port of the device * @tty: tty to be opened * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->open. 
It activates * the devices using @port->ops->activate if not active already. And waits for * the device to be ready using tty_port_block_til_ready() (e.g. raises * DTR/CTS and waits for carrier). * * Note that @port->ops->shutdown is not called when @port->ops->activate * returns an error (on the contrary, @tty->ops->close is). * * Locking: Caller holds tty lock. * * Note: may drop and reacquire tty lock (in tty_port_block_til_ready()) so * @tty and @port may have changed state (eg., may be hung up now). */ int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp) { spin_lock_irq(&port->lock); ++port->count; spin_unlock_irq(&port->lock); tty_port_tty_set(port, tty); /* * Do the device-specific open only if the hardware isn't * already initialized. Serialize open and shutdown using the * port mutex. */ mutex_lock(&port->mutex); if (!tty_port_initialized(port)) { clear_bit(TTY_IO_ERROR, &tty->flags); if (port->ops->activate) { int retval = port->ops->activate(port, tty); if (retval) { mutex_unlock(&port->mutex); return retval; } } tty_port_set_initialized(port, true); } mutex_unlock(&port->mutex); return tty_port_block_til_ready(port, tty, filp); } EXPORT_SYMBOL(tty_port_open);
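For context, the sketch below shows roughly how a driver is expected to wire the helpers above into its tty_operations. The foo_* names are hypothetical; only the tty_port_* and tty_* calls are the APIs implemented in this file, and the snippet is an outline rather than a buildable driver.

/* Sketch: delegating a driver's open/close/hangup to the tty_port helpers. */
#include <linux/tty.h>
#include <linux/tty_driver.h>

static int foo_activate(struct tty_port *port, struct tty_struct *tty)
{
	/* power up / enable the hardware here */
	return 0;
}

static void foo_shutdown(struct tty_port *port)
{
	/* quiesce the hardware here */
}

static const struct tty_port_operations foo_port_ops = {
	.activate = foo_activate,
	.shutdown = foo_shutdown,
};

static int foo_open(struct tty_struct *tty, struct file *filp)
{
	/* counts the opener, runs ->activate once, blocks for carrier */
	return tty_port_open(tty->port, tty, filp);
}

static void foo_close(struct tty_struct *tty, struct file *filp)
{
	/* last close runs ->shutdown and drops DTR/RTS if HUPCL is set */
	tty_port_close(tty->port, tty, filp);
}

static void foo_hangup(struct tty_struct *tty)
{
	tty_port_hangup(tty->port);
}

static const struct tty_operations foo_tty_ops = {
	.open   = foo_open,
	.close  = foo_close,
	.hangup = foo_hangup,
};

/* At probe time a driver would typically do something like:
 *	tty_port_init(&foo_port);
 *	foo_port.ops = &foo_port_ops;
 *	tty_port_register_device(&foo_port, foo_tty_driver, 0, parent_dev);
 * where foo_port and foo_tty_driver are the driver's own objects.
 */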
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Pluggable TCP congestion control support and newReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
#include <trace/events/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries!
*/ struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); trace_tcp_cong_state_set(sk, ca_state); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } /* Must be called with rcu lock held */ static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif return ca; } /* Simple linear search, not much in here. */ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (e->key == key) return e; } return NULL; } int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } return 0; } /* Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { pr_notice("%s already registered or non-unique key\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module gets removed entirely. * * A try_module_get() should fail by now as our module is * in "going" state since no refs are held anymore and * module_exit() handler being called. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Replace a registered old ca with a new one. * * The new ca must have the same name as the old one, that has been * registered. */ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; int ret = 0; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); existing = tcp_ca_find_key(old_ca->key); if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { pr_notice("%s not registered or non-unique key\n", ca->name); ret = -EINVAL; } else if (existing != old_ca) { pr_notice("invalid old congestion control algorithm to replace\n"); ret = -EINVAL; } else { /* Add the new one before removing the old one to keep * one implementation available all the time. 
*/ list_add_tail_rcu(&ca->list, &tcp_cong_list); list_del_rcu(&existing->list); pr_debug("%s updated\n", ca->name); } spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module or struct_ops gets removed entirely. */ if (!ret) synchronize_rcu(); return ret; } u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = tcp_ca_find_autoload(name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; } rcu_read_unlock(); return key; } char *tcp_ca_get_name_by_key(u32 key, char *buffer) { const struct tcp_congestion_ops *ca; char *ret = NULL; rcu_read_lock(); ca = tcp_ca_find_key(key); if (ca) { strscpy(buffer, ca->name, TCP_CA_NAME_MAX); ret = buffer; } rcu_read_unlock(); return ret; } /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. 
*/ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); icsk->icsk_ca_initialized = 0; bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *prev; int ret; rcu_read_lock(); ca = tcp_ca_find_autoload(name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else if (!net_eq(net, &init_net) && !(ca->flags & TCP_CONG_NON_RESTRICTED)) { /* Only init netns can set default to a restricted algorithm */ ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } rcu_read_unlock(); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(&init_net, CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(struct net *net, char *name) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); strscpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. 
*/ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; int err = 0; if (icsk->icsk_ca_dst_locked) return -EPERM; rcu_read_lock(); if (!load) ca = tcp_ca_find(name); else ca = tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; else tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but * something better;) a packet is only considered (s)acked in its entirety to * defend the ACK attacks described in the RFC. Slow start processes a stretch * ACK of degree N as if N acks of degree 1 are received back to back except * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ __bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); acked -= cwnd - tcp_snd_cwnd(tp); tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ __bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ __bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } /* In dangerous area, increase slowly. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, };
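To show how the registration path above is typically consumed, here is a sketch of a minimal congestion-control module. "foo" is a hypothetical algorithm that simply reuses the exported Reno callbacks, which is enough to satisfy tcp_validate_congestion_control() (ssthresh, undo_cwnd and cong_avoid are all set).

/* Sketch: minimal congestion-control module reusing the Reno callbacks. */
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_foo = {
	.name		= "foo",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,
};

static int __init tcp_foo_register(void)
{
	/* Fails with -EEXIST if the name/key is already on tcp_cong_list. */
	return tcp_register_congestion_control(&tcp_foo);
}

static void __exit tcp_foo_unregister(void)
{
	/* Waits for RCU readers before the module text can go away. */
	tcp_unregister_congestion_control(&tcp_foo);
}

module_init(tcp_foo_register);
module_exit(tcp_foo_unregister);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example TCP congestion control (sketch)");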
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */

#ifndef _IP_SET_HASH_GEN_H
#define _IP_SET_HASH_GEN_H

#include <linux/rcupdate.h>
#include <linux/rcupdate_wait.h>
#include <linux/jhash.h>
#include <linux/types.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/ipset/ip_set.h>

#define __ipset_dereference(p)		\
	rcu_dereference_protected(p, 1)
#define ipset_dereference_nfnl(p)	\
	rcu_dereference_protected(p,	\
		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
#define ipset_dereference_set(p, set)	\
	rcu_dereference_protected(p,	\
		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
		lockdep_is_held(&(set)->lock))
#define ipset_dereference_bh_nfnl(p)	\
	rcu_dereference_bh_check(p,	\
		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))

/* Hashing which uses arrays to resolve clashing. The hash table is resized
 * (doubled) when searching becomes too long.
 * Internally jhash is used with the assumption that the size of the
 * stored data is a multiple of sizeof(u32).
 *
 * Readers and resizing
 *
 * Resizing can be triggered by userspace command only, and those
 * are serialized by the nfnl mutex. During resizing the set is
 * read-locked, so the only possible concurrent operations are
 * the kernel side readers. Those must be protected by proper RCU locking.
 */

/* Number of elements to store in an initial array block */
#define AHASH_INIT_SIZE			2
/* Max number of elements to store in an array block */
#define AHASH_MAX_SIZE			(6 * AHASH_INIT_SIZE)
/* Max number of elements in the array block when tuned */
#define AHASH_MAX_TUNED			64
#define AHASH_MAX(h)			((h)->bucketsize)

/* A hash bucket */
struct hbucket {
	struct rcu_head rcu;	/* for call_rcu */
	/* Which positions are used in the array */
	DECLARE_BITMAP(used, AHASH_MAX_TUNED);
	u8 size;		/* size of the array */
	u8 pos;			/* position of the first free entry */
	unsigned char value[]	/* the array of the values */
		__aligned(__alignof__(u64));
};

/* Region size for locking == 2^HTABLE_REGION_BITS */
#define HTABLE_REGION_BITS	10
#define ahash_numof_locks(htable_bits)		\
	((htable_bits) < HTABLE_REGION_BITS ? 1	\
		: jhash_size((htable_bits) - HTABLE_REGION_BITS))
#define ahash_sizeof_regions(htable_bits)	\
	(ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
#define ahash_region(n, htable_bits)		\
	((n) % ahash_numof_locks(htable_bits))
#define ahash_bucket_start(h, htable_bits)	\
	((htable_bits) < HTABLE_REGION_BITS ? 0	\
		: (h) * jhash_size(HTABLE_REGION_BITS))
#define ahash_bucket_end(h, htable_bits)	\
	((htable_bits) < HTABLE_REGION_BITS ?
jhash_size(htable_bits) \ : ((h) + 1) * jhash_size(HTABLE_REGION_BITS)) struct htable_gc { struct delayed_work dwork; struct ip_set *set; /* Set the gc belongs to */ u32 region; /* Last gc run position */ }; /* The hash table: the table size stored here in order to make resizing easy */ struct htable { atomic_t ref; /* References for resizing */ atomic_t uref; /* References for dumping and gc */ u8 htable_bits; /* size of hash table == 2^htable_bits */ u32 maxelem; /* Maxelem per region */ struct ip_set_region *hregion; /* Region locks and ext sizes */ struct hbucket __rcu *bucket[]; /* hashtable buckets */ }; #define hbucket(h, i) ((h)->bucket[i]) #define ext_size(n, dsize) \ (sizeof(struct hbucket) + (n) * (dsize)) #ifndef IPSET_NET_COUNT #define IPSET_NET_COUNT 1 #endif /* Book-keeping of the prefixes added to the set */ struct net_prefixes { u32 nets[IPSET_NET_COUNT]; /* number of elements for this cidr */ u8 cidr[IPSET_NET_COUNT]; /* the cidr value */ }; /* Compute the hash table size */ static size_t htable_size(u8 hbits) { size_t hsize; /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) #else #define __CIDR(cidr, i) (cidr) #endif /* cidr + 1 is stored in net_prefixes to support /0 */ #define NCIDR_PUT(cidr) ((cidr) + 1) #define NCIDR_GET(cidr) ((cidr) - 1) #ifdef IP_SET_HASH_WITH_NETS_PACKED /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */ #define DCIDR_PUT(cidr) ((cidr) - 1) #define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1) #else #define DCIDR_PUT(cidr) (cidr) #define DCIDR_GET(cidr, i) __CIDR(cidr, i) #endif #define INIT_CIDR(cidr, host_mask) \ DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask)) #ifdef IP_SET_HASH_WITH_NET0 /* cidr from 0 to HOST_MASK value and c = cidr + 1 */ #define NLEN (HOST_MASK + 1) #define CIDR_POS(c) ((c) - 1) #else /* cidr from 1 to HOST_MASK value and c = cidr + 1 */ #define NLEN HOST_MASK #define CIDR_POS(c) ((c) - 2) #endif #else #define NLEN 0 #endif /* IP_SET_HASH_WITH_NETS */ #define SET_ELEM_EXPIRED(set, d) \ (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) #if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) static const union nf_inet_addr onesmask = { .all[0] = 0xffffffff, .all[1] = 0xffffffff, .all[2] = 0xffffffff, .all[3] = 0xffffffff }; static const union nf_inet_addr zeromask = {}; #endif #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE #error "MTYPE is not defined!" #endif #ifndef HTYPE #error "HTYPE is not defined!" #endif #ifndef HOST_MASK #error "HOST_MASK is not defined!" 
#endif /* Family dependent templates */ #undef ahash_data #undef mtype_data_equal #undef mtype_do_data_match #undef mtype_data_set_flags #undef mtype_data_reset_elem #undef mtype_data_reset_flags #undef mtype_data_netmask #undef mtype_data_list #undef mtype_data_next #undef mtype_elem #undef mtype_ahash_destroy #undef mtype_ext_cleanup #undef mtype_add_cidr #undef mtype_del_cidr #undef mtype_ahash_memsize #undef mtype_flush #undef mtype_destroy #undef mtype_same_set #undef mtype_kadt #undef mtype_uadt #undef mtype_add #undef mtype_del #undef mtype_test_cidrs #undef mtype_test #undef mtype_uref #undef mtype_resize #undef mtype_ext_size #undef mtype_resize_ad #undef mtype_head #undef mtype_list #undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init #undef mtype_cancel_gc #undef mtype_variant #undef mtype_data_match #undef htype #undef HKEY #define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal) #ifdef IP_SET_HASH_WITH_NETS #define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match) #else #define mtype_do_data_match(d) 1 #endif #define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags) #define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem) #define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags) #define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask) #define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) #define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) #define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) #define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr) #define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) #define mtype_add IPSET_TOKEN(MTYPE, _add) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_uref IPSET_TOKEN(MTYPE, _uref) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) #define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size) #define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) #ifndef HKEY_DATALEN #define HKEY_DATALEN sizeof(struct mtype_elem) #endif #define htype MTYPE #define HKEY(data, initval, htable_bits) \ ({ \ const u32 *__k = (const u32 *)data; \ u32 __l = HKEY_DATALEN / sizeof(u32); \ \ BUILD_BUG_ON(HKEY_DATALEN % sizeof(u32) != 0); \ \ jhash2(__k, __l, initval) & jhash_mask(htable_bits); \ }) /* The generic hash structure */ struct htype { struct htable __rcu *table; /* the hash table */ struct htable_gc gc; /* gc workqueue */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK u32 markmask; /* markmask value for mark mask to store */ #endif u8 bucketsize; /* max elements in an array block */ #if defined(IP_SET_HASH_WITH_NETMASK) || 
defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */ #endif }; /* ADD|DEL entries saved during resize */ struct mtype_resize_ad { struct list_head list; enum ipset_adt ad; /* ADD|DEL element */ struct mtype_elem d; /* Element value */ struct ip_set_ext ext; /* Extensions for ADD */ struct ip_set_ext mext; /* Target extensions for ADD */ u32 flags; /* Flags for ADD */ }; #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different * sized networks. cidr == real cidr + 1 to support /0. */ static void mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { int i, j; spin_lock_bh(&set->lock); /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) { if (j != -1) { continue; } else if (h->nets[i].cidr[n] < cidr) { j = i; } else if (h->nets[i].cidr[n] == cidr) { h->nets[CIDR_POS(cidr)].nets[n]++; goto unlock; } } if (j != -1) { for (; i > j; i--) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; h->nets[CIDR_POS(cidr)].nets[n] = 1; unlock: spin_unlock_bh(&set->lock); } static void mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { u8 i, j, net_end = NLEN - 1; spin_lock_bh(&set->lock); for (i = 0; i < NLEN; i++) { if (h->nets[i].cidr[n] != cidr) continue; h->nets[CIDR_POS(cidr)].nets[n]--; if (h->nets[CIDR_POS(cidr)].nets[n] > 0) goto unlock; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; goto unlock; } unlock: spin_unlock_bh(&set->lock); } #endif /* Calculate the actual memory size of the set data */ static size_t mtype_ahash_memsize(const struct htype *h, const struct htable *t) { return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits); } /* Get the ith element from the array block n */ #define ahash_data(n, i, dsize) \ ((struct mtype_elem *)((n)->value + ((i) * (dsize)))) static void mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) { int i; for (i = 0; i < n->pos; i++) if (test_bit(i, n->used)) ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } /* Flush a hash type of set: destroy all elements */ static void mtype_flush(struct ip_set *set) { struct htype *h = set->data; struct htable *t; struct hbucket *n; u32 r, i; t = ipset_dereference_nfnl(h->table); for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { spin_lock_bh(&t->hregion[r].lock); for (i = ahash_bucket_start(r, t->htable_bits); i < ahash_bucket_end(r, t->htable_bits); i++) { n = __ipset_dereference(hbucket(t, i)); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set, n); /* FIXME: use slab cache */ rcu_assign_pointer(hbucket(t, i), NULL); kfree_rcu(n, rcu); } t->hregion[r].ext_size = 0; t->hregion[r].elements = 0; spin_unlock_bh(&t->hregion[r].lock); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(h->nets)); #endif } /* Destroy the hashtable part of the set */ static void mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) { struct hbucket *n; u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { n = (__force struct hbucket *)hbucket(t, i); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) mtype_ext_cleanup(set, 
n); /* FIXME: use slab cache */ kfree(n); } ip_set_free(t->hregion); ip_set_free(t); } /* Destroy a hash type of set */ static void mtype_destroy(struct ip_set *set) { struct htype *h = set->data; struct list_head *l, *lt; mtype_ahash_destroy(set, (__force struct htable *)h->table, true); list_for_each_safe(l, lt, &h->ad) { list_del(l); kfree(l); } kfree(h); set->data = NULL; } static bool mtype_same_set(const struct ip_set *a, const struct ip_set *b) { const struct htype *x = a->data; const struct htype *y = b->data; /* Resizing changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && #if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && #endif a->extensions == b->extensions; } static void mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) { struct hbucket *n, *tmp; struct mtype_elem *data; u32 i, j, d; size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif u8 htable_bits = t->htable_bits; spin_lock_bh(&t->hregion[r].lock); for (i = ahash_bucket_start(r, htable_bits); i < ahash_bucket_end(r, htable_bits); i++) { n = __ipset_dereference(hbucket(t, i)); if (!n) continue; for (j = 0, d = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) { d++; continue; } data = ahash_data(n, j, dsize); if (!ip_set_timeout_expired(ext_timeout(data, set))) continue; pr_debug("expired %u/%u\n", i, j); clear_bit(j, n->used); smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, k)), k); #endif t->hregion[r].elements--; ip_set_ext_destroy(set, data); d++; } if (d >= AHASH_INIT_SIZE) { if (d >= n->size) { t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, i), NULL); kfree_rcu(n, rcu); continue; } tmp = kzalloc(sizeof(*tmp) + (n->size - AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); if (!tmp) /* Still try to delete expired elements. 
*/ continue; tmp->size = n->size - AHASH_INIT_SIZE; for (j = 0, d = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); memcpy(tmp->value + d * dsize, data, dsize); set_bit(d, tmp->used); d++; } tmp->pos = d; t->hregion[r].ext_size -= ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, i), tmp); kfree_rcu(n, rcu); } } spin_unlock_bh(&t->hregion[r].lock); } static void mtype_gc(struct work_struct *work) { struct htable_gc *gc; struct ip_set *set; struct htype *h; struct htable *t; u32 r, numof_locks; unsigned int next_run; gc = container_of(work, struct htable_gc, dwork.work); set = gc->set; h = set->data; spin_lock_bh(&set->lock); t = ipset_dereference_set(h->table, set); atomic_inc(&t->uref); numof_locks = ahash_numof_locks(t->htable_bits); r = gc->region++; if (r >= numof_locks) { r = gc->region = 0; } next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; if (next_run < HZ/10) next_run = HZ/10; spin_unlock_bh(&set->lock); mtype_gc_do(set, h, t, r); if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { pr_debug("Table destroy after resize by expire: %p\n", t); mtype_ahash_destroy(set, t, false); } queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run); } static void mtype_gc_init(struct htable_gc *gc) { INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } static void mtype_cancel_gc(struct ip_set *set) { struct htype *h = set->data; if (SET_WITH_TIMEOUT(set)) cancel_delayed_work_sync(&h->gc.dwork); } static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or * fail due to memory pressures. 
*/ static int mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; #endif struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; struct list_head *l, *lt; struct mtype_resize_ad *x; u32 i, j, r, nr, key; int ret; #ifdef IP_SET_HASH_WITH_NETS tmp = kmalloc(dsize, GFP_KERNEL); if (!tmp) return -ENOMEM; #endif orig = ipset_dereference_bh_nfnl(h->table); htable_bits = orig->htable_bits; retry: ret = 0; htable_bits++; if (!htable_bits) goto hbwarn; hsize = htable_size(htable_bits); if (!hsize) goto hbwarn; t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; } t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); if (!t->hregion) { ip_set_free(t); ret = -ENOMEM; goto out; } t->htable_bits = htable_bits; t->maxelem = h->maxelem / ahash_numof_locks(htable_bits); for (i = 0; i < ahash_numof_locks(htable_bits); i++) spin_lock_init(&t->hregion[i].lock); /* There can't be another parallel resizing, * but dumping, gc, kernel side add/del are possible */ orig = ipset_dereference_bh_nfnl(h->table); atomic_set(&orig->ref, 1); atomic_inc(&orig->uref); pr_debug("attempt to resize set %s from %u to %u, t %p\n", set->name, orig->htable_bits, htable_bits, orig); for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) { /* Expire may replace a hbucket with another one */ rcu_read_lock_bh(); for (i = ahash_bucket_start(r, orig->htable_bits); i < ahash_bucket_end(r, orig->htable_bits); i++) { n = __ipset_dereference(hbucket(orig, i)); if (!n) continue; for (j = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); if (SET_ELEM_EXPIRED(set, data)) continue; #ifdef IP_SET_HASH_WITH_NETS /* We have readers running parallel with us, * so the live data cannot be modified. */ flags = 0; memcpy(tmp, data, dsize); data = tmp; mtype_data_reset_flags(data, &flags); #endif key = HKEY(data, h->initval, htable_bits); m = __ipset_dereference(hbucket(t, key)); nr = ahash_region(key, htable_bits); if (!m) { m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, GFP_ATOMIC); if (!m) { ret = -ENOMEM; goto cleanup; } m->size = AHASH_INIT_SIZE; t->hregion[nr].ext_size += ext_size(AHASH_INIT_SIZE, dsize); RCU_INIT_POINTER(hbucket(t, key), m); } else if (m->pos >= m->size) { struct hbucket *ht; if (m->size >= AHASH_MAX(h)) { ret = -EAGAIN; } else { ht = kzalloc(sizeof(*ht) + (m->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); if (!ht) ret = -ENOMEM; } if (ret < 0) goto cleanup; memcpy(ht, m, sizeof(struct hbucket) + m->size * dsize); ht->size = m->size + AHASH_INIT_SIZE; t->hregion[nr].ext_size += ext_size(AHASH_INIT_SIZE, dsize); kfree(m); m = ht; RCU_INIT_POINTER(hbucket(t, key), ht); } d = ahash_data(m, m->pos, dsize); memcpy(d, data, dsize); set_bit(m->pos++, m->used); t->hregion[nr].elements++; #ifdef IP_SET_HASH_WITH_NETS mtype_data_reset_flags(d, &flags); #endif } } rcu_read_unlock_bh(); } /* There can't be any other writer. */ rcu_assign_pointer(h->table, t); /* Give time to other readers of the set */ synchronize_rcu(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); /* Add/delete elements processed by the SET target during resize. * Kernel-side add cannot trigger a resize and userspace actions * are serialized by the mutex. 
*/ list_for_each_safe(l, lt, &h->ad) { x = list_entry(l, struct mtype_resize_ad, list); if (x->ad == IPSET_ADD) { mtype_add(set, &x->d, &x->ext, &x->mext, x->flags); } else { mtype_del(set, &x->d, NULL, NULL, 0); } list_del(l); kfree(l); } /* If there's nobody else using the table, destroy it */ if (atomic_dec_and_test(&orig->uref)) { pr_debug("Table destroy by resize %p\n", orig); mtype_ahash_destroy(set, orig, false); } out: #ifdef IP_SET_HASH_WITH_NETS kfree(tmp); #endif return ret; cleanup: rcu_read_unlock_bh(); atomic_set(&orig->ref, 0); atomic_dec(&orig->uref); mtype_ahash_destroy(set, t, false); if (ret == -EAGAIN) goto retry; goto out; hbwarn: /* In case we have plenty of memory :-) */ pr_warn("Cannot increase the hashsize of set %s further\n", set->name); ret = -IPSET_ERR_HASH_FULL; goto out; } /* Get the current number of elements and ext_size in the set */ static void mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) { struct htype *h = set->data; const struct htable *t; u32 i, j, r; struct hbucket *n; struct mtype_elem *data; t = rcu_dereference_bh(h->table); for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { for (i = ahash_bucket_start(r, t->htable_bits); i < ahash_bucket_end(r, t->htable_bits); i++) { n = rcu_dereference_bh(hbucket(t, i)); if (!n) continue; for (j = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, set->dsize); if (!SET_ELEM_EXPIRED(set, data)) (*elements)++; } } *ext_size += t->hregion[r].ext_size; } } /* Add an element to a hash and update the internal counters when succeeded, * otherwise report the proper error code. */ static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct htype *h = set->data; struct htable *t; const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n, *old = ERR_PTR(-ENOENT); int i, j = -1, ret; bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; u32 r, key, multi = 0, elements, maxelem; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); r = ahash_region(key, t->htable_bits); atomic_inc(&t->uref); elements = t->hregion[r].elements; maxelem = t->maxelem; if (elements >= maxelem) { u32 e; if (SET_WITH_TIMEOUT(set)) { rcu_read_unlock_bh(); mtype_gc_do(set, h, t, r); rcu_read_lock_bh(); } maxelem = h->maxelem; elements = 0; for (e = 0; e < ahash_numof_locks(t->htable_bits); e++) elements += t->hregion[e].elements; if (elements >= maxelem && SET_WITH_FORCEADD(set)) forceadd = true; } rcu_read_unlock_bh(); spin_lock_bh(&t->hregion[r].lock); n = rcu_dereference_bh(hbucket(t, key)); if (!n) { if (forceadd || elements >= maxelem) goto set_full; old = NULL; n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, GFP_ATOMIC); if (!n) { ret = -ENOMEM; goto unlock; } n->size = AHASH_INIT_SIZE; t->hregion[r].ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } for (i = 0; i < n->pos; i++) { if (!test_bit(i, n->used)) { /* Reuse first deleted entry */ if (j == -1) { deleted = reuse = true; j = i; } continue; } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { if (flag_exist || SET_ELEM_EXPIRED(set, data)) { /* Just the extensions could be overwritten */ j = i; goto overwrite_extensions; } ret = -IPSET_ERR_EXIST; goto unlock; } /* Reuse first timed out entry */ if (SET_ELEM_EXPIRED(set, data) && j == -1) { j = i; reuse = true; } } if (reuse || forceadd) { if (j == 
-1) j = 0; data = ahash_data(n, j, set->dsize); if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), i); #endif ip_set_ext_destroy(set, data); t->hregion[r].elements--; } goto copy_data; } if (elements >= maxelem) goto set_full; /* Create a new slot */ if (n->pos >= n->size) { #ifdef IP_SET_HASH_WITH_MULTI if (h->bucketsize >= AHASH_MAX_TUNED) goto set_full; else if (h->bucketsize <= multi) h->bucketsize += AHASH_INIT_SIZE; #endif if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); ret = -EAGAIN; goto resize; } old = n; n = kzalloc(sizeof(*n) + (old->size + AHASH_INIT_SIZE) * set->dsize, GFP_ATOMIC); if (!n) { ret = -ENOMEM; goto unlock; } memcpy(n, old, sizeof(struct hbucket) + old->size * set->dsize); n->size = old->size + AHASH_INIT_SIZE; t->hregion[r].ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); } copy_elem: j = n->pos++; data = ahash_data(n, j, set->dsize); copy_data: t->hregion[r].elements++; #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); #endif memcpy(data, d, sizeof(struct mtype_elem)); overwrite_extensions: #ifdef IP_SET_HASH_WITH_NETS mtype_data_set_flags(data, flags); #endif if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(data, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(data, set), ext); /* Must come last for the case when timed out entry is reused */ if (SET_WITH_TIMEOUT(set)) ip_set_timeout_set(ext_timeout(data, set), ext->timeout); smp_mb__before_atomic(); set_bit(j, n->used); if (old != ERR_PTR(-ENOENT)) { rcu_assign_pointer(hbucket(t, key), n); if (old) kfree_rcu(old, rcu); } ret = 0; resize: spin_unlock_bh(&t->hregion[r].lock); if (atomic_read(&t->ref) && ext->target) { /* Resize is in process and kernel side add, save values */ struct mtype_resize_ad *x; x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); if (!x) /* Don't bother */ goto out; x->ad = IPSET_ADD; memcpy(&x->d, value, sizeof(struct mtype_elem)); memcpy(&x->ext, ext, sizeof(struct ip_set_ext)); memcpy(&x->mext, mext, sizeof(struct ip_set_ext)); x->flags = flags; spin_lock_bh(&set->lock); list_add_tail(&x->list, &h->ad); spin_unlock_bh(&set->lock); } goto out; set_full: if (net_ratelimit()) pr_warn("Set %s is full, maxelem %u reached\n", set->name, maxelem); ret = -IPSET_ERR_HASH_FULL; unlock: spin_unlock_bh(&t->hregion[r].lock); out: if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { pr_debug("Table destroy after resize by add: %p\n", t); mtype_ahash_destroy(set, t, false); } return ret; } /* Delete an element from the hash and free up space if possible. */ static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct htype *h = set->data; struct htable *t; const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; struct mtype_resize_ad *x = NULL; int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; /* Userspace add and resize is excluded by the mutex. * Kernespace add does not trigger resize. 
*/ rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); r = ahash_region(key, t->htable_bits); atomic_inc(&t->uref); rcu_read_unlock_bh(); spin_lock_bh(&t->hregion[r].lock); n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; for (i = 0, k = 0; i < n->pos; i++) { if (!test_bit(i, n->used)) { k++; continue; } data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; if (SET_ELEM_EXPIRED(set, data)) goto out; ret = 0; clear_bit(i, n->used); smp_mb__after_atomic(); if (i + 1 == n->pos) n->pos--; t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), j); #endif ip_set_ext_destroy(set, data); if (atomic_read(&t->ref) && ext->target) { /* Resize is in process and kernel side del, * save values */ x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); if (x) { x->ad = IPSET_DEL; memcpy(&x->d, value, sizeof(struct mtype_elem)); x->flags = flags; } } for (; i < n->pos; i++) { if (!test_bit(i, n->used)) k++; } if (n->pos == 0 && k == 0) { t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); } else if (k >= AHASH_INIT_SIZE) { struct hbucket *tmp = kzalloc(sizeof(*tmp) + (n->size - AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); if (!tmp) goto out; tmp->size = n->size - AHASH_INIT_SIZE; for (j = 0, k = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); memcpy(tmp->value + k * dsize, data, dsize); set_bit(k, tmp->used); k++; } tmp->pos = k; t->hregion[r].ext_size -= ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, key), tmp); kfree_rcu(n, rcu); } goto out; } out: spin_unlock_bh(&t->hregion[r].lock); if (x) { spin_lock_bh(&set->lock); list_add(&x->list, &h->ad); spin_unlock_bh(&set->lock); } if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { pr_debug("Table destroy after resize by del: %p\n", t); mtype_ahash_destroy(set, t, false); } return ret; } static int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { if (!ip_set_match_extensions(set, ext, mext, flags, data)) return 0; /* nomatch entries return -ENOTEMPTY */ return mtype_do_data_match(data); } #ifdef IP_SET_HASH_WITH_NETS /* Special test function which takes into account the different network * sizes added to the set */ static int mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct htype *h = set->data; struct htable *t = rcu_dereference_bh(h->table); struct hbucket *n; struct mtype_elem *data; #if IPSET_NET_COUNT == 2 struct mtype_elem orig = *d; int ret, i, j = 0, k; #else int ret, i, j = 0; #endif u32 key, multi = 0; pr_debug("test by nets\n"); for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) { #if IPSET_NET_COUNT == 2 mtype_data_reset_elem(d, &orig); mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false); for (k = 0; k < NLEN && h->nets[k].cidr[1] && !multi; k++) { mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]), true); #else mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); n = rcu_dereference_bh(hbucket(t, key)); if (!n) continue; for (i = 0; i < n->pos; i++) { if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; ret = mtype_data_match(data, ext, mext, set, 
flags); if (ret != 0) return ret; #ifdef IP_SET_HASH_WITH_MULTI /* No match, reset multiple match flag */ multi = 0; #endif } #if IPSET_NET_COUNT == 2 } #endif } return 0; } #endif /* Test whether the element is added to the set */ static int mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct htype *h = set->data; struct htable *t; struct mtype_elem *d = value; struct hbucket *n; struct mtype_elem *data; int i, ret = 0; u32 key, multi = 0; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, * try all possible network sizes */ for (i = 0; i < IPSET_NET_COUNT; i++) if (DCIDR_GET(d->cidr, i) != HOST_MASK) break; if (i == IPSET_NET_COUNT) { ret = mtype_test_cidrs(set, d, ext, mext, flags); goto out; } #endif key = HKEY(d, h->initval, t->htable_bits); n = rcu_dereference_bh(hbucket(t, key)); if (!n) { ret = 0; goto out; } for (i = 0; i < n->pos; i++) { if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; ret = mtype_data_match(data, ext, mext, set, flags); if (ret != 0) goto out; } out: rcu_read_unlock_bh(); return ret; } /* Reply a HEADER request: fill out the header part of the set */ static int mtype_head(struct ip_set *set, struct sk_buff *skb) { struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u32 elements = 0; size_t ext_size = 0; u8 htable_bits; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); mtype_ext_size(set, &elements, &ext_size); memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size; htable_bits = t->htable_bits; rcu_read_unlock_bh(); nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; #ifdef IP_SET_HASH_WITH_BITMASK /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask * and not bitmask. These two are mutually exclusive. 
*/ if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { if (set->family == NFPROTO_IPV4) { if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) goto nla_put_failure; } else if (set->family == NFPROTO_IPV6) { if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) goto nla_put_failure; } } #endif #ifdef IP_SET_HASH_WITH_NETMASK if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) { if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) || nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval))) goto nla_put_failure; } if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } /* Make possible to run dumping parallel with resizing */ static void mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) { struct htype *h = set->data; struct htable *t; if (start) { rcu_read_lock_bh(); t = ipset_dereference_bh_nfnl(h->table); atomic_inc(&t->uref); cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; rcu_read_unlock_bh(); } else if (cb->args[IPSET_CB_PRIVATE]) { t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { pr_debug("Table destroy after resize " " by dump: %p\n", t); mtype_ahash_destroy(set, t, false); } cb->args[IPSET_CB_PRIVATE] = 0; } } /* Reply a LIST/SAVE request: dump the elements of the specified set */ static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { const struct htable *t; struct nlattr *atd, *nested; const struct hbucket *n; const struct mtype_elem *e; u32 first = cb->args[IPSET_CB_ARG0]; /* We assume that one hash bucket fills into one page */ void *incomplete; int i, ret = 0; atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; pr_debug("list hash set %s\n", set->name); t = (const struct htable *)cb->args[IPSET_CB_PRIVATE]; /* Expire may replace a hbucket with another one */ rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { cond_resched_rcu(); incomplete = skb_tail_pointer(skb); n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0])); pr_debug("cb->arg bucket: %lu, t %p n %p\n", cb->args[IPSET_CB_ARG0], t, n); if (!n) continue; for (i = 0; i < n->pos; i++) { if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); if (SET_ELEM_EXPIRED(set, e)) continue; pr_debug("list hash %lu hbucket %p i %u, data %p\n", cb->args[IPSET_CB_ARG0], n, i, e); nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (cb->args[IPSET_CB_ARG0] == first) { nla_nest_cancel(skb, atd); ret = -EMSGSIZE; goto out; } goto nla_put_failure; } if (mtype_data_list(skb, e)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; nla_nest_end(skb, nested); } } nla_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nlmsg_trim(skb, incomplete); if (unlikely(first == cb->args[IPSET_CB_ARG0])) { pr_warn("Can't list set %s: one bucket does not fit into a message. 
Please report it!\n", set->name); cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } else { nla_nest_end(skb, atd); } out: rcu_read_unlock(); return ret; } static int IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt); static int IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); static const struct ip_set_type_variant mtype_variant = { .kadt = mtype_kadt, .uadt = mtype_uadt, .adt = { [IPSET_ADD] = mtype_add, [IPSET_DEL] = mtype_del, [IPSET_TEST] = mtype_test, }, .destroy = mtype_destroy, .flush = mtype_flush, .head = mtype_head, .list = mtype_list, .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, .cancel_gc = mtype_cancel_gc, .region_lock = true, }; #ifdef IP_SET_EMIT_CREATE static int IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; #ifdef IP_SET_HASH_WITH_MARKMASK u32 markmask; #endif u8 hbits; #if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) int ret __attribute__((unused)) = 0; u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128; union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; struct htable *t; u32 i; pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); #ifdef IP_SET_PROTO_UNDEF if (set->family != NFPROTO_UNSPEC) return -IPSET_ERR_INVALID_FAMILY; #else if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; #endif if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; #ifdef IP_SET_HASH_WITH_MARKMASK /* Separated condition in order to avoid directive in argument list */ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK))) return -IPSET_ERR_PROTOCOL; markmask = 0xffffffff; if (tb[IPSET_ATTR_MARKMASK]) { markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK])); if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; } #endif #ifdef IP_SET_HASH_WITH_NETMASK if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); if ((set->family == NFPROTO_IPV4 && netmask > 32) || (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; /* we convert netmask to bitmask and store it */ if (set->family == NFPROTO_IPV4) bitmask.ip = ip_set_netmask(netmask); else ip6_netmask(&bitmask, netmask); } #endif #ifdef IP_SET_HASH_WITH_BITMASK if (tb[IPSET_ATTR_BITMASK]) { /* bitmask and netmask do the same thing, allow only one of these options */ if (tb[IPSET_ATTR_NETMASK]) return -IPSET_ERR_BITMASK_NETMASK_EXCL; if (set->family == NFPROTO_IPV4) { ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); if (ret || !bitmask.ip) return -IPSET_ERR_INVALID_NETMASK; } else if (set->family == NFPROTO_IPV6) { ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); if (ret || ipv6_addr_any(&bitmask.in6)) return -IPSET_ERR_INVALID_NETMASK; } if (nf_inet_addr_cmp(&bitmask, &zeromask)) return -IPSET_ERR_INVALID_NETMASK; } #endif if (tb[IPSET_ATTR_HASHSIZE]) { hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); if (hashsize < IPSET_MIMINAL_HASHSIZE) hashsize = IPSET_MIMINAL_HASHSIZE; } if 
(tb[IPSET_ATTR_MAXELEM]) maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); hsize = sizeof(*h); h = kzalloc(hsize, GFP_KERNEL); if (!h) return -ENOMEM; /* Compute htable_bits from the user input parameter hashsize. * Assume that hashsize == 2^htable_bits, * otherwise round up to the first 2^n value. */ hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); return -ENOMEM; } t = ip_set_alloc(hsize); if (!t) { kfree(h); return -ENOMEM; } t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); if (!t->hregion) { ip_set_free(t); kfree(h); return -ENOMEM; } h->gc.set = set; for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; #if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK h->markmask = markmask; #endif if (tb[IPSET_ATTR_INITVAL]) h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL])); else get_random_bytes(&h->initval, sizeof(h->initval)); h->bucketsize = AHASH_MAX_SIZE; if (tb[IPSET_ATTR_BUCKETSIZE]) { h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]); if (h->bucketsize < AHASH_INIT_SIZE) h->bucketsize = AHASH_INIT_SIZE; else if (h->bucketsize > AHASH_MAX_SIZE) h->bucketsize = AHASH_MAX_SIZE; else if (h->bucketsize % 2) h->bucketsize += 1; } t->htable_bits = hbits; t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); INIT_LIST_HEAD(&h->ad); set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { #endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)), __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem))); #ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)), __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem))); } #endif set->timeout = IPSET_NO_TIMEOUT; if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) #endif IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc); #ifndef IP_SET_PROTO_UNDEF else IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc); #endif } pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", set->name, jhash_size(t->htable_bits), t->htable_bits, h->maxelem, set->data, t); return 0; } #endif /* IP_SET_EMIT_CREATE */ #undef HKEY_DATALEN
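/*
 * Illustrative sketch (not part of the original source): how an element is
 * mapped to a hash bucket, following the HKEY() macro defined above.  The
 * element type, function name and field names below are assumptions for
 * illustration only; the real sets use their own mtype_elem structures,
 * whose size HKEY()'s BUILD_BUG_ON() requires to be a multiple of
 * sizeof(u32).
 */
#if 0	/* example only -- never compiled */
struct example_elem {
	__be32 ip;	/* stored data */
	u32 pad;	/* keep the size a multiple of sizeof(u32) */
};

static u32 example_bucket(const struct example_elem *e, u32 initval,
			  u8 htable_bits)
{
	/* jhash2() hashes the element as an array of u32 words ... */
	u32 h = jhash2((const u32 *)e, sizeof(*e) / sizeof(u32), initval);

	/*
	 * ... and jhash_mask() keeps the low htable_bits bits, i.e. an
	 * index into a table of jhash_size(htable_bits) buckets.
	 */
	return h & jhash_mask(htable_bits);
}
#endif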
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM smc

#if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SMC_H

#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/tracepoint.h>
#include <net/ipv6.h>
#include "smc.h"
#include "smc_core.h"

TRACE_EVENT(smc_switch_to_fallback,

	    TP_PROTO(const struct smc_sock *smc, int fallback_rsn),

	    TP_ARGS(smc, fallback_rsn),

	    TP_STRUCT__entry(
			     __field(const void *, sk)
			     __field(const void *, clcsk)
			     __field(u64, net_cookie)
			     __field(int, fallback_rsn)
	    ),

	    TP_fast_assign(
			   const struct sock *sk = &smc->sk;
			   const struct sock *clcsk = smc->clcsock->sk;

			   __entry->sk = sk;
			   __entry->clcsk = clcsk;
			   __entry->net_cookie = sock_net(sk)->net_cookie;
			   __entry->fallback_rsn = fallback_rsn;
	    ),

	    TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d",
		      __entry->sk, __entry->clcsk,
		      __entry->net_cookie, __entry->fallback_rsn)
);

DECLARE_EVENT_CLASS(smc_msg_event,

		    TP_PROTO(const struct smc_sock *smc, size_t len),

		    TP_ARGS(smc, len),

		    TP_STRUCT__entry(
				     __field(const void *, smc)
				     __field(u64, net_cookie)
				     __field(size_t, len)
				     __string(name, smc->conn.lnk->ibname)
		    ),

		    TP_fast_assign(
				   const struct sock *sk = &smc->sk;

				   __entry->smc = smc;
				   __entry->net_cookie = sock_net(sk)->net_cookie;
				   __entry->len = len;
				   __assign_str(name);
		    ),

		    TP_printk("smc=%p net=%llu len=%zu dev=%s",
			      __entry->smc, __entry->net_cookie,
			      __entry->len, __get_str(name))
);

DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg,

	     TP_PROTO(const struct smc_sock *smc, size_t len),

	     TP_ARGS(smc, len)
);

DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg,

	     TP_PROTO(const struct smc_sock *smc, size_t len),

	     TP_ARGS(smc, len)
);

TRACE_EVENT(smcr_link_down,

	    TP_PROTO(const struct smc_link *lnk, void *location),

	    TP_ARGS(lnk, location),

	    TP_STRUCT__entry(
			     __field(const void *, lnk)
			     __field(const void *, lgr)
			     __field(u64, net_cookie)
			     __field(int, state)
			     __string(name, lnk->ibname)
			     __field(void *, location)
	    ),

	    TP_fast_assign(
			   const struct smc_link_group *lgr = lnk->lgr;

			   __entry->lnk = lnk;
			   __entry->lgr = lgr;
			   __entry->net_cookie = lgr->net->net_cookie;
			   __entry->state = lnk->state;
			   __assign_str(name);
			   __entry->location = location;
	    ),

	    TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS",
		      __entry->lnk, __entry->lgr,
		      __entry->net_cookie, __entry->state,
		      __get_str(name), __entry->location)
);

#endif /* _TRACE_SMC_H */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE smc_tracepoint

#include <trace/define_trace.h>
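/*
 * Illustrative sketch (not part of the original header): each TRACE_EVENT()
 * and DEFINE_EVENT() above generates a trace_<name>() hook that callers in
 * the SMC code can fire.  The caller function and variable names below are
 * assumptions for illustration only, not the actual call sites.
 */
#if 0	/* example only -- never compiled */
static void example_fallback_path(struct smc_sock *smc, int reason)
{
	/* emits the smc_switch_to_fallback event defined above */
	trace_smc_switch_to_fallback(smc, reason);
}

static void example_tx_path(struct smc_sock *smc, size_t len)
{
	/* emits the smc_tx_sendmsg event of the smc_msg_event class */
	trace_smc_tx_sendmsg(smc, len);
}
#endif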
// SPDX-License-Identifier: GPL-2.0+
/*
 * Driver for USB Mass Storage compliant devices
 *
 * Current development and maintenance by:
 *   (c) 1999-2002 Matthew Dharm (mdharm-usb@one-eyed-alien.net)
 *
 * Developed with the assistance of:
 *   (c) 2000 David L. Brown, Jr. (usb-storage@davidb.org)
 *   (c) 2000 Stephen J. Gowdy (SGowdy@lbl.gov)
 *   (c) 2002 Alan Stern <stern@rowland.org>
 *
 * Initial work by:
 *   (c) 1999 Michael Gee (michael@linuxspecific.com)
 *
 * This driver is based on the 'USB Mass Storage Class' document.  This
 * describes in detail the protocol used to communicate with such
 * devices.  Clearly, the designers had SCSI and ATAPI commands in
 * mind when they created this document.  The commands are all very
 * similar to commands in the SCSI-II and ATAPI specifications.
 *
 * It is important to note that in a number of cases this class
 * exhibits class-specific exemptions from the USB specification.
* Notably the usage of NAK, STALL and ACK differs from the norm, in * that they are used to communicate wait, failed and OK on commands. * * Also, for certain devices, the interrupt endpoint is used to convey * status of a command. */ #include <linux/sched.h> #include <linux/gfp.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/usb/quirks.h> #include <scsi/scsi.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_device.h> #include "usb.h" #include "transport.h" #include "protocol.h" #include "scsiglue.h" #include "debug.h" #include <linux/blkdev.h> #include "../../scsi/sd.h" /*********************************************************************** * Data transfer routines ***********************************************************************/ /* * This is subtle, so pay attention: * --------------------------------- * We're very concerned about races with a command abort. Hanging this code * is a sure fire way to hang the kernel. (Note that this discussion applies * only to transactions resulting from a scsi queued-command, since only * these transactions are subject to a scsi abort. Other transactions, such * as those occurring during device-specific initialization, must be handled * by a separate code path.) * * The abort function (usb_storage_command_abort() in scsiglue.c) first * sets the machine state and the ABORTING bit in us->dflags to prevent * new URBs from being submitted. It then calls usb_stor_stop_transport() * below, which atomically tests-and-clears the URB_ACTIVE bit in us->dflags * to see if the current_urb needs to be stopped. Likewise, the SG_ACTIVE * bit is tested to see if the current_sg scatter-gather request needs to be * stopped. The timeout callback routine does much the same thing. * * When a disconnect occurs, the DISCONNECTING bit in us->dflags is set to * prevent new URBs from being submitted, and usb_stor_stop_transport() is * called to stop any ongoing requests. * * The submit function first verifies that the submitting is allowed * (neither ABORTING nor DISCONNECTING bits are set) and that the submit * completes without errors, and only then sets the URB_ACTIVE bit. This * prevents the stop_transport() function from trying to cancel the URB * while the submit call is underway. Next, the submit function must test * the flags to see if an abort or disconnect occurred during the submission * or before the URB_ACTIVE bit was set. If so, it's essential to cancel * the URB if it hasn't been cancelled already (i.e., if the URB_ACTIVE bit * is still set). Either way, the function must then wait for the URB to * finish. Note that the URB can still be in progress even after a call to * usb_unlink_urb() returns. * * The idea is that (1) once the ABORTING or DISCONNECTING bit is set, * either the stop_transport() function or the submitting function * is guaranteed to call usb_unlink_urb() for an active URB, * and (2) test_and_clear_bit() prevents usb_unlink_urb() from being * called more than once or from being called during usb_submit_urb(). */ /* * This is the completion handler which will wake us up when an URB * completes. */ static void usb_stor_blocking_completion(struct urb *urb) { struct completion *urb_done_ptr = urb->context; complete(urb_done_ptr); } /* * This is the common part of the URB message submission code * * All URBs from the usb-storage driver involved in handling a queued scsi * command _must_ pass through this function (or something like it) for the * abort mechanisms to work properly. 
*/ static int usb_stor_msg_common(struct us_data *us, int timeout) { struct completion urb_done; long timeleft; int status; /* don't submit URBs during abort processing */ if (test_bit(US_FLIDX_ABORTING, &us->dflags)) return -EIO; /* set up data structures for the wakeup system */ init_completion(&urb_done); /* fill the common fields in the URB */ us->current_urb->context = &urb_done; us->current_urb->transfer_flags = 0; /* * we assume that if transfer_buffer isn't us->iobuf then it * hasn't been mapped for DMA. Yes, this is clunky, but it's * easier than always having the caller tell us whether the * transfer buffer has already been mapped. */ if (us->current_urb->transfer_buffer == us->iobuf) us->current_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; us->current_urb->transfer_dma = us->iobuf_dma; /* submit the URB */ status = usb_submit_urb(us->current_urb, GFP_NOIO); if (status) { /* something went wrong */ return status; } /* * since the URB has been submitted successfully, it's now okay * to cancel it */ set_bit(US_FLIDX_URB_ACTIVE, &us->dflags); /* did an abort occur during the submission? */ if (test_bit(US_FLIDX_ABORTING, &us->dflags)) { /* cancel the URB, if it hasn't been cancelled already */ if (test_and_clear_bit(US_FLIDX_URB_ACTIVE, &us->dflags)) { usb_stor_dbg(us, "-- cancelling URB\n"); usb_unlink_urb(us->current_urb); } } /* wait for the completion of the URB */ timeleft = wait_for_completion_interruptible_timeout( &urb_done, timeout ? : MAX_SCHEDULE_TIMEOUT); clear_bit(US_FLIDX_URB_ACTIVE, &us->dflags); if (timeleft <= 0) { usb_stor_dbg(us, "%s -- cancelling URB\n", timeleft == 0 ? "Timeout" : "Signal"); usb_kill_urb(us->current_urb); } /* return the URB status */ return us->current_urb->status; } /* * Transfer one control message, with timeouts, and allowing early * termination. Return codes are usual -Exxx, *not* USB_STOR_XFER_xxx. */ int usb_stor_control_msg(struct us_data *us, unsigned int pipe, u8 request, u8 requesttype, u16 value, u16 index, void *data, u16 size, int timeout) { int status; usb_stor_dbg(us, "rq=%02x rqtype=%02x value=%04x index=%02x len=%u\n", request, requesttype, value, index, size); /* fill in the devrequest structure */ us->cr->bRequestType = requesttype; us->cr->bRequest = request; us->cr->wValue = cpu_to_le16(value); us->cr->wIndex = cpu_to_le16(index); us->cr->wLength = cpu_to_le16(size); /* fill and submit the URB */ usb_fill_control_urb(us->current_urb, us->pusb_dev, pipe, (unsigned char*) us->cr, data, size, usb_stor_blocking_completion, NULL); status = usb_stor_msg_common(us, timeout); /* return the actual length of the data transferred if no error */ if (status == 0) status = us->current_urb->actual_length; return status; } EXPORT_SYMBOL_GPL(usb_stor_control_msg); /* * This is a version of usb_clear_halt() that allows early termination and * doesn't read the status from the device -- this is because some devices * crash their internal firmware when the status is requested after a halt. * * A definitive list of these 'bad' devices is too difficult to maintain or * make complete enough to be useful. This problem was first observed on the * Hagiwara FlashGate DUAL unit. However, bus traces reveal that neither * MacOS nor Windows checks the status after clearing a halt. * * Since many vendors in this space limit their testing to interoperability * with these two OSes, specification violations like this one are common. 
*/ int usb_stor_clear_halt(struct us_data *us, unsigned int pipe) { int result; int endp = usb_pipeendpoint(pipe); if (usb_pipein (pipe)) endp |= USB_DIR_IN; result = usb_stor_control_msg(us, us->send_ctrl_pipe, USB_REQ_CLEAR_FEATURE, USB_RECIP_ENDPOINT, USB_ENDPOINT_HALT, endp, NULL, 0, 3*HZ); if (result >= 0) usb_reset_endpoint(us->pusb_dev, endp); usb_stor_dbg(us, "result = %d\n", result); return result; } EXPORT_SYMBOL_GPL(usb_stor_clear_halt); /* * Interpret the results of a URB transfer * * This function prints appropriate debugging messages, clears halts on * non-control endpoints, and translates the status to the corresponding * USB_STOR_XFER_xxx return code. */ static int interpret_urb_result(struct us_data *us, unsigned int pipe, unsigned int length, int result, unsigned int partial) { usb_stor_dbg(us, "Status code %d; transferred %u/%u\n", result, partial, length); switch (result) { /* no error code; did we send all the data? */ case 0: if (partial != length) { usb_stor_dbg(us, "-- short transfer\n"); return USB_STOR_XFER_SHORT; } usb_stor_dbg(us, "-- transfer complete\n"); return USB_STOR_XFER_GOOD; /* stalled */ case -EPIPE: /* * for control endpoints, (used by CB[I]) a stall indicates * a failed command */ if (usb_pipecontrol(pipe)) { usb_stor_dbg(us, "-- stall on control pipe\n"); return USB_STOR_XFER_STALLED; } /* for other sorts of endpoint, clear the stall */ usb_stor_dbg(us, "clearing endpoint halt for pipe 0x%x\n", pipe); if (usb_stor_clear_halt(us, pipe) < 0) return USB_STOR_XFER_ERROR; return USB_STOR_XFER_STALLED; /* babble - the device tried to send more than we wanted to read */ case -EOVERFLOW: usb_stor_dbg(us, "-- babble\n"); return USB_STOR_XFER_LONG; /* the transfer was cancelled by abort, disconnect, or timeout */ case -ECONNRESET: usb_stor_dbg(us, "-- transfer cancelled\n"); return USB_STOR_XFER_ERROR; /* short scatter-gather read transfer */ case -EREMOTEIO: usb_stor_dbg(us, "-- short read transfer\n"); return USB_STOR_XFER_SHORT; /* abort or disconnect in progress */ case -EIO: usb_stor_dbg(us, "-- abort or disconnect in progress\n"); return USB_STOR_XFER_ERROR; /* the catch-all error case */ default: usb_stor_dbg(us, "-- unknown error\n"); return USB_STOR_XFER_ERROR; } } /* * Transfer one control message, without timeouts, but allowing early * termination. Return codes are USB_STOR_XFER_xxx. */ int usb_stor_ctrl_transfer(struct us_data *us, unsigned int pipe, u8 request, u8 requesttype, u16 value, u16 index, void *data, u16 size) { int result; usb_stor_dbg(us, "rq=%02x rqtype=%02x value=%04x index=%02x len=%u\n", request, requesttype, value, index, size); /* fill in the devrequest structure */ us->cr->bRequestType = requesttype; us->cr->bRequest = request; us->cr->wValue = cpu_to_le16(value); us->cr->wIndex = cpu_to_le16(index); us->cr->wLength = cpu_to_le16(size); /* fill and submit the URB */ usb_fill_control_urb(us->current_urb, us->pusb_dev, pipe, (unsigned char*) us->cr, data, size, usb_stor_blocking_completion, NULL); result = usb_stor_msg_common(us, 0); return interpret_urb_result(us, pipe, size, result, us->current_urb->actual_length); } EXPORT_SYMBOL_GPL(usb_stor_ctrl_transfer); /* * Receive one interrupt buffer, without timeouts, but allowing early * termination. Return codes are USB_STOR_XFER_xxx. * * This routine always uses us->recv_intr_pipe as the pipe and * us->ep_bInterval as the interrupt interval. 
*/ static int usb_stor_intr_transfer(struct us_data *us, void *buf, unsigned int length) { int result; unsigned int pipe = us->recv_intr_pipe; unsigned int maxp; usb_stor_dbg(us, "xfer %u bytes\n", length); /* calculate the max packet size */ maxp = usb_maxpacket(us->pusb_dev, pipe); if (maxp > length) maxp = length; /* fill and submit the URB */ usb_fill_int_urb(us->current_urb, us->pusb_dev, pipe, buf, maxp, usb_stor_blocking_completion, NULL, us->ep_bInterval); result = usb_stor_msg_common(us, 0); return interpret_urb_result(us, pipe, length, result, us->current_urb->actual_length); } /* * Transfer one buffer via bulk pipe, without timeouts, but allowing early * termination. Return codes are USB_STOR_XFER_xxx. If the bulk pipe * stalls during the transfer, the halt is automatically cleared. */ int usb_stor_bulk_transfer_buf(struct us_data *us, unsigned int pipe, void *buf, unsigned int length, unsigned int *act_len) { int result; usb_stor_dbg(us, "xfer %u bytes\n", length); /* fill and submit the URB */ usb_fill_bulk_urb(us->current_urb, us->pusb_dev, pipe, buf, length, usb_stor_blocking_completion, NULL); result = usb_stor_msg_common(us, 0); /* store the actual length of the data transferred */ if (act_len) *act_len = us->current_urb->actual_length; return interpret_urb_result(us, pipe, length, result, us->current_urb->actual_length); } EXPORT_SYMBOL_GPL(usb_stor_bulk_transfer_buf); /* * Transfer a scatter-gather list via bulk transfer * * This function does basically the same thing as usb_stor_bulk_transfer_buf() * above, but it uses the usbcore scatter-gather library. */ static int usb_stor_bulk_transfer_sglist(struct us_data *us, unsigned int pipe, struct scatterlist *sg, int num_sg, unsigned int length, unsigned int *act_len) { int result; /* don't submit s-g requests during abort processing */ if (test_bit(US_FLIDX_ABORTING, &us->dflags)) goto usb_stor_xfer_error; /* initialize the scatter-gather request block */ usb_stor_dbg(us, "xfer %u bytes, %d entries\n", length, num_sg); result = usb_sg_init(&us->current_sg, us->pusb_dev, pipe, 0, sg, num_sg, length, GFP_NOIO); if (result) { usb_stor_dbg(us, "usb_sg_init returned %d\n", result); goto usb_stor_xfer_error; } /* * since the block has been initialized successfully, it's now * okay to cancel it */ set_bit(US_FLIDX_SG_ACTIVE, &us->dflags); /* did an abort occur during the submission? */ if (test_bit(US_FLIDX_ABORTING, &us->dflags)) { /* cancel the request, if it hasn't been cancelled already */ if (test_and_clear_bit(US_FLIDX_SG_ACTIVE, &us->dflags)) { usb_stor_dbg(us, "-- cancelling sg request\n"); usb_sg_cancel(&us->current_sg); } } /* wait for the completion of the transfer */ usb_sg_wait(&us->current_sg); clear_bit(US_FLIDX_SG_ACTIVE, &us->dflags); result = us->current_sg.status; if (act_len) *act_len = us->current_sg.bytes; return interpret_urb_result(us, pipe, length, result, us->current_sg.bytes); usb_stor_xfer_error: if (act_len) *act_len = 0; return USB_STOR_XFER_ERROR; } /* * Commonly used function. Transfers a complete command * via usb_stor_bulk_transfer_sglist() above and sets the command's resid. */ int usb_stor_bulk_srb(struct us_data* us, unsigned int pipe, struct scsi_cmnd* srb) { unsigned int partial; int result = usb_stor_bulk_transfer_sglist(us, pipe, scsi_sglist(srb), scsi_sg_count(srb), scsi_bufflen(srb), &partial); scsi_set_resid(srb, scsi_bufflen(srb) - partial); return result; } EXPORT_SYMBOL_GPL(usb_stor_bulk_srb); /* * Transfer an entire SCSI command's worth of data payload over the bulk * pipe.
* * Note that this uses usb_stor_bulk_transfer_buf() and * usb_stor_bulk_transfer_sglist() to achieve its goals -- * this function simply determines whether we're going to use * scatter-gather or not, and acts appropriately. */ int usb_stor_bulk_transfer_sg(struct us_data* us, unsigned int pipe, void *buf, unsigned int length_left, int use_sg, int *residual) { int result; unsigned int partial; /* are we scatter-gathering? */ if (use_sg) { /* use the usb core scatter-gather primitives */ result = usb_stor_bulk_transfer_sglist(us, pipe, (struct scatterlist *) buf, use_sg, length_left, &partial); length_left -= partial; } else { /* no scatter-gather, just make the request */ result = usb_stor_bulk_transfer_buf(us, pipe, buf, length_left, &partial); length_left -= partial; } /* store the residual and return the error code */ if (residual) *residual = length_left; return result; } EXPORT_SYMBOL_GPL(usb_stor_bulk_transfer_sg); /*********************************************************************** * Transport routines ***********************************************************************/ /* * There are so many devices that report the capacity incorrectly, * this routine was written to counteract some of the resulting * problems. */ static void last_sector_hacks(struct us_data *us, struct scsi_cmnd *srb) { struct gendisk *disk; struct scsi_disk *sdkp; u32 sector; /* To Report "Medium Error: Record Not Found */ static unsigned char record_not_found[18] = { [0] = 0x70, /* current error */ [2] = MEDIUM_ERROR, /* = 0x03 */ [7] = 0x0a, /* additional length */ [12] = 0x14 /* Record Not Found */ }; /* * If last-sector problems can't occur, whether because the * capacity was already decremented or because the device is * known to report the correct capacity, then we don't need * to do anything. */ if (!us->use_last_sector_hacks) return; /* Was this command a READ(10) or a WRITE(10)? */ if (srb->cmnd[0] != READ_10 && srb->cmnd[0] != WRITE_10) goto done; /* Did this command access the last sector? */ sector = (srb->cmnd[2] << 24) | (srb->cmnd[3] << 16) | (srb->cmnd[4] << 8) | (srb->cmnd[5]); disk = scsi_cmd_to_rq(srb)->q->disk; if (!disk) goto done; sdkp = scsi_disk(disk); if (!sdkp) goto done; if (sector + 1 != sdkp->capacity) goto done; if (srb->result == SAM_STAT_GOOD && scsi_get_resid(srb) == 0) { /* * The command succeeded. We know this device doesn't * have the last-sector bug, so stop checking it. */ us->use_last_sector_hacks = 0; } else { /* * The command failed. Allow up to 3 retries in case this * is some normal sort of failure. After that, assume the * capacity is wrong and we're trying to access the sector * beyond the end. Replace the result code and sense data * with values that will cause the SCSI core to fail the * command immediately, instead of going into an infinite * (or even just a very long) retry loop. */ if (++us->last_sector_retries < 3) return; srb->result = SAM_STAT_CHECK_CONDITION; memcpy(srb->sense_buffer, record_not_found, sizeof(record_not_found)); } done: /* * Don't reset the retry counter for TEST UNIT READY commands, * because they get issued after device resets which might be * caused by a failed last-sector access. */ if (srb->cmnd[0] != TEST_UNIT_READY) us->last_sector_retries = 0; } /* * Invoke the transport and basic error-handling/recovery methods * * This is used by the protocol layers to actually send the message to * the device and receive the response. 
*/ void usb_stor_invoke_transport(struct scsi_cmnd *srb, struct us_data *us) { int need_auto_sense; int result; /* send the command to the transport layer */ scsi_set_resid(srb, 0); result = us->transport(srb, us); /* * if the command gets aborted by the higher layers, we need to * short-circuit all other processing */ if (test_bit(US_FLIDX_TIMED_OUT, &us->dflags)) { usb_stor_dbg(us, "-- command was aborted\n"); srb->result = DID_ABORT << 16; goto Handle_Errors; } /* if there is a transport error, reset and don't auto-sense */ if (result == USB_STOR_TRANSPORT_ERROR) { usb_stor_dbg(us, "-- transport indicates error, resetting\n"); srb->result = DID_ERROR << 16; goto Handle_Errors; } /* if the transport provided its own sense data, don't auto-sense */ if (result == USB_STOR_TRANSPORT_NO_SENSE) { srb->result = SAM_STAT_CHECK_CONDITION; last_sector_hacks(us, srb); return; } srb->result = SAM_STAT_GOOD; /* * Determine if we need to auto-sense * * I normally don't use a flag like this, but it's almost impossible * to understand what's going on here if I don't. */ need_auto_sense = 0; /* * If we're running the CB transport, which is incapable * of determining status on its own, we will auto-sense * unless the operation involved a data-in transfer. Devices * can signal most data-in errors by stalling the bulk-in pipe. */ if ((us->protocol == USB_PR_CB || us->protocol == USB_PR_DPCM_USB) && srb->sc_data_direction != DMA_FROM_DEVICE) { usb_stor_dbg(us, "-- CB transport device requiring auto-sense\n"); need_auto_sense = 1; } /* Some devices (Kindle) require another command after SYNC CACHE */ if ((us->fflags & US_FL_SENSE_AFTER_SYNC) && srb->cmnd[0] == SYNCHRONIZE_CACHE) { usb_stor_dbg(us, "-- sense after SYNC CACHE\n"); need_auto_sense = 1; } /* * If we have a failure, we're going to do a REQUEST_SENSE * automatically. Note that we differentiate between a command * "failure" and an "error" in the transport mechanism. */ if (result == USB_STOR_TRANSPORT_FAILED) { usb_stor_dbg(us, "-- transport indicates command failure\n"); need_auto_sense = 1; } /* * Determine if this device is SAT by seeing if the * command executed successfully. Otherwise we'll have * to wait for at least one CHECK_CONDITION to determine * SANE_SENSE support */ if (unlikely((srb->cmnd[0] == ATA_16 || srb->cmnd[0] == ATA_12) && result == USB_STOR_TRANSPORT_GOOD && !(us->fflags & US_FL_SANE_SENSE) && !(us->fflags & US_FL_BAD_SENSE) && !(srb->cmnd[2] & 0x20))) { usb_stor_dbg(us, "-- SAT supported, increasing auto-sense\n"); us->fflags |= US_FL_SANE_SENSE; } /* * A short transfer on a command where we don't expect it * is unusual, but it doesn't mean we need to auto-sense. 
*/ if ((scsi_get_resid(srb) > 0) && !((srb->cmnd[0] == REQUEST_SENSE) || (srb->cmnd[0] == INQUIRY) || (srb->cmnd[0] == MODE_SENSE) || (srb->cmnd[0] == LOG_SENSE) || (srb->cmnd[0] == MODE_SENSE_10))) { usb_stor_dbg(us, "-- unexpectedly short transfer\n"); } /* Now, if we need to do the auto-sense, let's do it */ if (need_auto_sense) { int temp_result; struct scsi_eh_save ses; int sense_size = US_SENSE_SIZE; struct scsi_sense_hdr sshdr; const u8 *scdd; u8 fm_ili; /* device supports and needs bigger sense buffer */ if (us->fflags & US_FL_SANE_SENSE) sense_size = ~0; Retry_Sense: usb_stor_dbg(us, "Issuing auto-REQUEST_SENSE\n"); scsi_eh_prep_cmnd(srb, &ses, NULL, 0, sense_size); /* FIXME: we must do the protocol translation here */ if (us->subclass == USB_SC_RBC || us->subclass == USB_SC_SCSI || us->subclass == USB_SC_CYP_ATACB) srb->cmd_len = 6; else srb->cmd_len = 12; /* issue the auto-sense command */ scsi_set_resid(srb, 0); temp_result = us->transport(us->srb, us); /* let's clean up right away */ scsi_eh_restore_cmnd(srb, &ses); if (test_bit(US_FLIDX_TIMED_OUT, &us->dflags)) { usb_stor_dbg(us, "-- auto-sense aborted\n"); srb->result = DID_ABORT << 16; /* If SANE_SENSE caused this problem, disable it */ if (sense_size != US_SENSE_SIZE) { us->fflags &= ~US_FL_SANE_SENSE; us->fflags |= US_FL_BAD_SENSE; } goto Handle_Errors; } /* * Some devices claim to support larger sense but fail when * trying to request it. When a transport failure happens * using US_FL_SANE_SENSE, we always retry with a standard * (small) sense request. This fixes some USB GSM modems. */ if (temp_result == USB_STOR_TRANSPORT_FAILED && sense_size != US_SENSE_SIZE) { usb_stor_dbg(us, "-- auto-sense failure, retry small sense\n"); sense_size = US_SENSE_SIZE; us->fflags &= ~US_FL_SANE_SENSE; us->fflags |= US_FL_BAD_SENSE; goto Retry_Sense; } /* Other failures */ if (temp_result != USB_STOR_TRANSPORT_GOOD) { usb_stor_dbg(us, "-- auto-sense failure\n"); /* * we skip the reset if this happens to be a * multi-target device, since failure of an * auto-sense is perfectly valid */ srb->result = DID_ERROR << 16; if (!(us->fflags & US_FL_SCM_MULT_TARG)) goto Handle_Errors; return; } /* * If the sense data returned is larger than 18 bytes then we * assume this device supports requesting more in the future. * The response code must be 70h through 73h inclusive. */ if (srb->sense_buffer[7] > (US_SENSE_SIZE - 8) && !(us->fflags & US_FL_SANE_SENSE) && !(us->fflags & US_FL_BAD_SENSE) && (srb->sense_buffer[0] & 0x7C) == 0x70) { usb_stor_dbg(us, "-- SANE_SENSE support enabled\n"); us->fflags |= US_FL_SANE_SENSE; /* * Indicate to the user that we truncated their sense * because we didn't know it supported larger sense. */ usb_stor_dbg(us, "-- Sense data truncated to %i from %i\n", US_SENSE_SIZE, srb->sense_buffer[7] + 8); srb->sense_buffer[7] = (US_SENSE_SIZE - 8); } scsi_normalize_sense(srb->sense_buffer, SCSI_SENSE_BUFFERSIZE, &sshdr); usb_stor_dbg(us, "-- Result from auto-sense is %d\n", temp_result); usb_stor_dbg(us, "-- code: 0x%x, key: 0x%x, ASC: 0x%x, ASCQ: 0x%x\n", sshdr.response_code, sshdr.sense_key, sshdr.asc, sshdr.ascq); #ifdef CONFIG_USB_STORAGE_DEBUG usb_stor_show_sense(us, sshdr.sense_key, sshdr.asc, sshdr.ascq); #endif /* set the result so the higher layers expect this data */ srb->result = SAM_STAT_CHECK_CONDITION; scdd = scsi_sense_desc_find(srb->sense_buffer, SCSI_SENSE_BUFFERSIZE, 4); fm_ili = (scdd ? scdd[3] : srb->sense_buffer[2]) & 0xA0; /* * We often get empty sense data.
This could indicate that * everything worked or that there was an unspecified * problem. We have to decide which. */ if (sshdr.sense_key == 0 && sshdr.asc == 0 && sshdr.ascq == 0 && fm_ili == 0) { /* * If things are really okay, then let's show that. * Zero out the sense buffer so the higher layers * won't realize we did an unsolicited auto-sense. */ if (result == USB_STOR_TRANSPORT_GOOD) { srb->result = SAM_STAT_GOOD; srb->sense_buffer[0] = 0x0; } /* * ATA-passthru commands use sense data to report * the command completion status, and often devices * return Check Condition status when nothing is * wrong. */ else if (srb->cmnd[0] == ATA_16 || srb->cmnd[0] == ATA_12) { /* leave the data alone */ } /* * If there was a problem, report an unspecified * hardware error to prevent the higher layers from * entering an infinite retry loop. */ else { srb->result = DID_ERROR << 16; if ((sshdr.response_code & 0x72) == 0x72) srb->sense_buffer[1] = HARDWARE_ERROR; else srb->sense_buffer[2] = HARDWARE_ERROR; } } } /* * Some devices don't work or return incorrect data the first * time they get a READ(10) command, or for the first READ(10) * after a media change. If the INITIAL_READ10 flag is set, * keep track of whether READ(10) commands succeed. If the * previous one succeeded and this one failed, set the REDO_READ10 * flag to force a retry. */ if (unlikely((us->fflags & US_FL_INITIAL_READ10) && srb->cmnd[0] == READ_10)) { if (srb->result == SAM_STAT_GOOD) { set_bit(US_FLIDX_READ10_WORKED, &us->dflags); } else if (test_bit(US_FLIDX_READ10_WORKED, &us->dflags)) { clear_bit(US_FLIDX_READ10_WORKED, &us->dflags); set_bit(US_FLIDX_REDO_READ10, &us->dflags); } /* * Next, if the REDO_READ10 flag is set, return a result * code that will cause the SCSI core to retry the READ(10) * command immediately. */ if (test_bit(US_FLIDX_REDO_READ10, &us->dflags)) { clear_bit(US_FLIDX_REDO_READ10, &us->dflags); srb->result = DID_IMM_RETRY << 16; srb->sense_buffer[0] = 0; } } /* Did we transfer less than the minimum amount required? */ if ((srb->result == SAM_STAT_GOOD || srb->sense_buffer[2] == 0) && scsi_bufflen(srb) - scsi_get_resid(srb) < srb->underflow) srb->result = DID_ERROR << 16; last_sector_hacks(us, srb); return; /* * Error and abort processing: try to resynchronize with the device * by issuing a port reset. If that fails, try a class-specific * device reset. */ Handle_Errors: /* * Set the RESETTING bit, and clear the ABORTING bit so that * the reset may proceed. */ scsi_lock(us_to_host(us)); set_bit(US_FLIDX_RESETTING, &us->dflags); clear_bit(US_FLIDX_ABORTING, &us->dflags); scsi_unlock(us_to_host(us)); /* * We must release the device lock because the pre_reset routine * will want to acquire it. */ mutex_unlock(&us->dev_mutex); result = usb_stor_port_reset(us); mutex_lock(&us->dev_mutex); if (result < 0) { scsi_lock(us_to_host(us)); usb_stor_report_device_reset(us); scsi_unlock(us_to_host(us)); us->transport_reset(us); } clear_bit(US_FLIDX_RESETTING, &us->dflags); last_sector_hacks(us, srb); } /* Stop the current URB transfer */ void usb_stor_stop_transport(struct us_data *us) { /* * If the state machine is blocked waiting for an URB, * let's wake it up. The test_and_clear_bit() call * guarantees that if a URB has just been submitted, * it won't be cancelled more than once. */ if (test_and_clear_bit(US_FLIDX_URB_ACTIVE, &us->dflags)) { usb_stor_dbg(us, "-- cancelling URB\n"); usb_unlink_urb(us->current_urb); } /* If we are waiting for a scatter-gather operation, cancel it. 
*/ if (test_and_clear_bit(US_FLIDX_SG_ACTIVE, &us->dflags)) { usb_stor_dbg(us, "-- cancelling sg request\n"); usb_sg_cancel(&us->current_sg); } } /* * Control/Bulk and Control/Bulk/Interrupt transport */ int usb_stor_CB_transport(struct scsi_cmnd *srb, struct us_data *us) { unsigned int transfer_length = scsi_bufflen(srb); unsigned int pipe = 0; int result; /* COMMAND STAGE */ /* let's send the command via the control pipe */ /* * The command is sometimes (e.g. after scsi_eh_prep_cmnd) on the stack. * The stack may be vmalloc'ed, so no DMA for us. Make a copy. */ memcpy(us->iobuf, srb->cmnd, srb->cmd_len); result = usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, US_CBI_ADSC, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, us->ifnum, us->iobuf, srb->cmd_len); /* check the return code for the command */ usb_stor_dbg(us, "Call to usb_stor_ctrl_transfer() returned %d\n", result); /* if we stalled the command, it means the command failed */ if (result == USB_STOR_XFER_STALLED) { return USB_STOR_TRANSPORT_FAILED; } /* Uh oh... serious problem here */ if (result != USB_STOR_XFER_GOOD) { return USB_STOR_TRANSPORT_ERROR; } /* DATA STAGE */ /* transfer the data payload for this command, if one exists */ if (transfer_length) { pipe = srb->sc_data_direction == DMA_FROM_DEVICE ? us->recv_bulk_pipe : us->send_bulk_pipe; result = usb_stor_bulk_srb(us, pipe, srb); usb_stor_dbg(us, "CBI data stage result is 0x%x\n", result); /* if we stalled the data transfer, it means the command failed */ if (result == USB_STOR_XFER_STALLED) return USB_STOR_TRANSPORT_FAILED; if (result > USB_STOR_XFER_STALLED) return USB_STOR_TRANSPORT_ERROR; } /* STATUS STAGE */ /* * NOTE: CB does not have a status stage. Silly, I know. So * we have to catch this at a higher level. */ if (us->protocol != USB_PR_CBI) return USB_STOR_TRANSPORT_GOOD; result = usb_stor_intr_transfer(us, us->iobuf, 2); usb_stor_dbg(us, "Got interrupt data (0x%x, 0x%x)\n", us->iobuf[0], us->iobuf[1]); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* * UFI gives us ASC and ASCQ, like a request sense * * REQUEST_SENSE and INQUIRY don't affect the sense data on UFI * devices, so we ignore the information for those commands. Note * that this means we could be ignoring a real error on these * commands, but that can't be helped. */ if (us->subclass == USB_SC_UFI) { if (srb->cmnd[0] == REQUEST_SENSE || srb->cmnd[0] == INQUIRY) return USB_STOR_TRANSPORT_GOOD; if (us->iobuf[0]) goto Failed; return USB_STOR_TRANSPORT_GOOD; } /* * If not UFI, we interpret the data as a result code. * The first byte should always be a 0x0. * * Some bogus devices don't follow that rule. They stuff the ASC * into the first byte -- so if it's non-zero, call it a failure.
*/ if (us->iobuf[0]) { usb_stor_dbg(us, "CBI IRQ data showed reserved bType 0x%x\n", us->iobuf[0]); goto Failed; } /* The second byte & 0x0F should be 0x0 for good, otherwise error */ switch (us->iobuf[1] & 0x0F) { case 0x00: return USB_STOR_TRANSPORT_GOOD; case 0x01: goto Failed; } return USB_STOR_TRANSPORT_ERROR; /* * the CBI spec requires that the bulk pipe must be cleared * following any data-in/out command failure (section 2.4.3.1.3) */ Failed: if (pipe) usb_stor_clear_halt(us, pipe); return USB_STOR_TRANSPORT_FAILED; } EXPORT_SYMBOL_GPL(usb_stor_CB_transport); /* * Bulk only transport */ /* Determine what the maximum LUN supported is */ int usb_stor_Bulk_max_lun(struct us_data *us) { int result; /* issue the command */ us->iobuf[0] = 0; result = usb_stor_control_msg(us, us->recv_ctrl_pipe, US_BULK_GET_MAX_LUN, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, us->ifnum, us->iobuf, 1, 10*HZ); usb_stor_dbg(us, "GetMaxLUN command result is %d, data is %d\n", result, us->iobuf[0]); /* If we have a successful request, return the result if valid. */ if (result > 0) { if (us->iobuf[0] <= US_BULK_MAX_LUN_LIMIT) { return us->iobuf[0]; } else { dev_info(&us->pusb_intf->dev, "Max LUN %d is not valid, using 0 instead", us->iobuf[0]); } } /* * Some devices don't like GetMaxLUN. They may STALL the control * pipe, they may return a zero-length result, they may do nothing at * all and timeout, or they may fail in even more bizarrely creative * ways. In these cases the best approach is to use the default * value: only one LUN. */ return 0; } int usb_stor_Bulk_transport(struct scsi_cmnd *srb, struct us_data *us) { struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf; struct bulk_cs_wrap *bcs = (struct bulk_cs_wrap *) us->iobuf; unsigned int transfer_length = scsi_bufflen(srb); unsigned int residue; int result; int fake_sense = 0; unsigned int cswlen; unsigned int cbwlen = US_BULK_CB_WRAP_LEN; /* Take care of BULK32 devices; set extra byte to 0 */ if (unlikely(us->fflags & US_FL_BULK32)) { cbwlen = 32; us->iobuf[31] = 0; } /* set up the command wrapper */ bcb->Signature = cpu_to_le32(US_BULK_CB_SIGN); bcb->DataTransferLength = cpu_to_le32(transfer_length); bcb->Flags = srb->sc_data_direction == DMA_FROM_DEVICE ? US_BULK_FLAG_IN : US_BULK_FLAG_OUT; bcb->Tag = ++us->tag; bcb->Lun = srb->device->lun; if (us->fflags & US_FL_SCM_MULT_TARG) bcb->Lun |= srb->device->id << 4; bcb->Length = srb->cmd_len; /* copy the command payload */ memset(bcb->CDB, 0, sizeof(bcb->CDB)); memcpy(bcb->CDB, srb->cmnd, bcb->Length); /* send it to out endpoint */ usb_stor_dbg(us, "Bulk Command S 0x%x T 0x%x L %d F %d Trg %d LUN %d CL %d\n", le32_to_cpu(bcb->Signature), bcb->Tag, le32_to_cpu(bcb->DataTransferLength), bcb->Flags, (bcb->Lun >> 4), (bcb->Lun & 0x0F), bcb->Length); result = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, bcb, cbwlen, NULL); usb_stor_dbg(us, "Bulk command transfer result=%d\n", result); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* DATA STAGE */ /* send/receive data payload, if there is any */ /* * Some USB-IDE converter chips need a 100us delay between the * command phase and the data phase. Some devices need a little * more than that, probably because of clock rate inaccuracies. */ if (unlikely(us->fflags & US_FL_GO_SLOW)) usleep_range(125, 150); if (transfer_length) { unsigned int pipe = srb->sc_data_direction == DMA_FROM_DEVICE ? 
us->recv_bulk_pipe : us->send_bulk_pipe; result = usb_stor_bulk_srb(us, pipe, srb); usb_stor_dbg(us, "Bulk data transfer result 0x%x\n", result); if (result == USB_STOR_XFER_ERROR) return USB_STOR_TRANSPORT_ERROR; /* * If the device tried to send back more data than the * amount requested, the spec requires us to transfer * the CSW anyway. Since there's no point retrying * the command, we'll return fake sense data indicating * Illegal Request, Invalid Field in CDB. */ if (result == USB_STOR_XFER_LONG) fake_sense = 1; /* * Sometimes a device will mistakenly skip the data phase * and go directly to the status phase without sending a * zero-length packet. If we get a 13-byte response here, * check whether it really is a CSW. */ if (result == USB_STOR_XFER_SHORT && srb->sc_data_direction == DMA_FROM_DEVICE && transfer_length - scsi_get_resid(srb) == US_BULK_CS_WRAP_LEN) { struct scatterlist *sg = NULL; unsigned int offset = 0; if (usb_stor_access_xfer_buf((unsigned char *) bcs, US_BULK_CS_WRAP_LEN, srb, &sg, &offset, FROM_XFER_BUF) == US_BULK_CS_WRAP_LEN && bcs->Signature == cpu_to_le32(US_BULK_CS_SIGN)) { usb_stor_dbg(us, "Device skipped data phase\n"); scsi_set_resid(srb, transfer_length); goto skipped_data_phase; } } } /* * See flow chart on pg 15 of the Bulk Only Transport spec for * an explanation of how this code works. */ /* get CSW for device status */ usb_stor_dbg(us, "Attempting to get CSW...\n"); result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, bcs, US_BULK_CS_WRAP_LEN, &cswlen); /* * Some broken devices add unnecessary zero-length packets to the * end of their data transfers. Such packets show up as 0-length * CSWs. If we encounter such a thing, try to read the CSW again. */ if (result == USB_STOR_XFER_SHORT && cswlen == 0) { usb_stor_dbg(us, "Received 0-length CSW; retrying...\n"); result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, bcs, US_BULK_CS_WRAP_LEN, &cswlen); } /* did the attempt to read the CSW fail? */ if (result == USB_STOR_XFER_STALLED) { /* get the status again */ usb_stor_dbg(us, "Attempting to get CSW (2nd try)...\n"); result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, bcs, US_BULK_CS_WRAP_LEN, NULL); } /* if we still have a failure at this point, we're in trouble */ usb_stor_dbg(us, "Bulk status result = %d\n", result); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; skipped_data_phase: /* check bulk status */ residue = le32_to_cpu(bcs->Residue); usb_stor_dbg(us, "Bulk Status S 0x%x T 0x%x R %u Stat 0x%x\n", le32_to_cpu(bcs->Signature), bcs->Tag, residue, bcs->Status); if (!(bcs->Tag == us->tag || (us->fflags & US_FL_BULK_IGNORE_TAG)) || bcs->Status > US_BULK_STAT_PHASE) { usb_stor_dbg(us, "Bulk logical error\n"); return USB_STOR_TRANSPORT_ERROR; } /* * Some broken devices report odd signatures, so we do not check them * for validity against the spec. We store the first one we see, * and check subsequent transfers for validity against this signature. 
*/ if (!us->bcs_signature) { us->bcs_signature = bcs->Signature; if (us->bcs_signature != cpu_to_le32(US_BULK_CS_SIGN)) usb_stor_dbg(us, "Learnt BCS signature 0x%08X\n", le32_to_cpu(us->bcs_signature)); } else if (bcs->Signature != us->bcs_signature) { usb_stor_dbg(us, "Signature mismatch: got %08X, expecting %08X\n", le32_to_cpu(bcs->Signature), le32_to_cpu(us->bcs_signature)); return USB_STOR_TRANSPORT_ERROR; } /* * try to compute the actual residue, based on how much data * was really transferred and what the device tells us */ if (residue && !(us->fflags & US_FL_IGNORE_RESIDUE)) { /* * Heuristically detect devices that generate bogus residues * by seeing what happens with INQUIRY and READ CAPACITY * commands. */ if (bcs->Status == US_BULK_STAT_OK && scsi_get_resid(srb) == 0 && ((srb->cmnd[0] == INQUIRY && transfer_length == 36) || (srb->cmnd[0] == READ_CAPACITY && transfer_length == 8))) { us->fflags |= US_FL_IGNORE_RESIDUE; } else { residue = min(residue, transfer_length); scsi_set_resid(srb, max(scsi_get_resid(srb), residue)); } } /* based on the status code, we report good or bad */ switch (bcs->Status) { case US_BULK_STAT_OK: /* device babbled -- return fake sense data */ if (fake_sense) { memcpy(srb->sense_buffer, usb_stor_sense_invalidCDB, sizeof(usb_stor_sense_invalidCDB)); return USB_STOR_TRANSPORT_NO_SENSE; } /* command good -- note that data could be short */ return USB_STOR_TRANSPORT_GOOD; case US_BULK_STAT_FAIL: /* command failed */ return USB_STOR_TRANSPORT_FAILED; case US_BULK_STAT_PHASE: /* * phase error -- note that a transport reset will be * invoked by the invoke_transport() function */ return USB_STOR_TRANSPORT_ERROR; } /* we should never get here, but if we do, we're in trouble */ return USB_STOR_TRANSPORT_ERROR; } EXPORT_SYMBOL_GPL(usb_stor_Bulk_transport); /*********************************************************************** * Reset routines ***********************************************************************/ /* * This is the common part of the device reset code. * * It's handy that every transport mechanism uses the control endpoint for * resets. * * Basically, we send a reset with a 5-second timeout, so we don't get * jammed attempting to do the reset. */ static int usb_stor_reset_common(struct us_data *us, u8 request, u8 requesttype, u16 value, u16 index, void *data, u16 size) { int result; int result2; if (test_bit(US_FLIDX_DISCONNECTING, &us->dflags)) { usb_stor_dbg(us, "No reset during disconnect\n"); return -EIO; } result = usb_stor_control_msg(us, us->send_ctrl_pipe, request, requesttype, value, index, data, size, 5*HZ); if (result < 0) { usb_stor_dbg(us, "Soft reset failed: %d\n", result); return result; } /* * Give the device some time to recover from the reset, * but don't delay disconnect processing. 
*/ wait_event_interruptible_timeout(us->delay_wait, test_bit(US_FLIDX_DISCONNECTING, &us->dflags), HZ*6); if (test_bit(US_FLIDX_DISCONNECTING, &us->dflags)) { usb_stor_dbg(us, "Reset interrupted by disconnect\n"); return -EIO; } usb_stor_dbg(us, "Soft reset: clearing bulk-in endpoint halt\n"); result = usb_stor_clear_halt(us, us->recv_bulk_pipe); usb_stor_dbg(us, "Soft reset: clearing bulk-out endpoint halt\n"); result2 = usb_stor_clear_halt(us, us->send_bulk_pipe); /* return a result code based on the result of the clear-halts */ if (result >= 0) result = result2; if (result < 0) usb_stor_dbg(us, "Soft reset failed\n"); else usb_stor_dbg(us, "Soft reset done\n"); return result; } /* This issues a CB[I] Reset to the device in question */ #define CB_RESET_CMD_SIZE 12 int usb_stor_CB_reset(struct us_data *us) { memset(us->iobuf, 0xFF, CB_RESET_CMD_SIZE); us->iobuf[0] = SEND_DIAGNOSTIC; us->iobuf[1] = 4; return usb_stor_reset_common(us, US_CBI_ADSC, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, us->ifnum, us->iobuf, CB_RESET_CMD_SIZE); } EXPORT_SYMBOL_GPL(usb_stor_CB_reset); /* * This issues a Bulk-only Reset to the device in question, including * clearing the subsequent endpoint halts that may occur. */ int usb_stor_Bulk_reset(struct us_data *us) { return usb_stor_reset_common(us, US_BULK_RESET_REQUEST, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, us->ifnum, NULL, 0); } EXPORT_SYMBOL_GPL(usb_stor_Bulk_reset); /* * Issue a USB port reset to the device. The caller must not hold * us->dev_mutex. */ int usb_stor_port_reset(struct us_data *us) { int result; /*for these devices we must use the class specific method */ if (us->pusb_dev->quirks & USB_QUIRK_RESET) return -EPERM; result = usb_lock_device_for_reset(us->pusb_dev, us->pusb_intf); if (result < 0) usb_stor_dbg(us, "unable to lock device for reset: %d\n", result); else { /* Were we disconnected while waiting for the lock? */ if (test_bit(US_FLIDX_DISCONNECTING, &us->dflags)) { result = -EIO; usb_stor_dbg(us, "No reset during disconnect\n"); } else { result = usb_reset_device(us->pusb_dev); usb_stor_dbg(us, "usb_reset_device returns %d\n", result); } usb_unlock_device(us->pusb_dev); } return result; }
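Below is a minimal, self-contained user-space sketch (not driver code) of the 31-byte Command Block Wrapper that usb_stor_Bulk_transport() assembles in us->iobuf before sending it down the bulk-out pipe. The field layout mirrors the bulk_cb_wrap members used above; the constant values are taken from the USB Mass Storage Bulk-Only Transport specification, and the READ(10) CDB is just an illustrative payload.

/* cbw_sketch.c - illustrative only, compile with: cc -o cbw_sketch cbw_sketch.c */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#define US_BULK_CB_SIGN   0x43425355u   /* 'USBC', little-endian on the wire */
#define US_BULK_FLAG_IN   0x80          /* bmCBWFlags bit 7: data-in (device to host) */

struct __attribute__((packed)) bulk_cb_wrap {
	uint32_t Signature;           /* dCBWSignature */
	uint32_t Tag;                 /* dCBWTag, echoed back in the CSW */
	uint32_t DataTransferLength;  /* dCBWDataTransferLength */
	uint8_t  Flags;               /* bmCBWFlags: direction bit */
	uint8_t  Lun;                 /* bCBWLUN */
	uint8_t  Length;              /* bCBWCBLength: number of valid CDB bytes */
	uint8_t  CDB[16];
};

int main(void)
{
	/* The wrapper must be exactly 31 bytes (US_BULK_CB_WRAP_LEN). */
	assert(sizeof(struct bulk_cb_wrap) == 31);

	/* Example: READ(10) of one block at LBA 0, 512 bytes, data-in. */
	uint8_t cdb[10] = { 0x28, 0, 0, 0, 0, 0, 0, 0, 1, 0 };
	struct bulk_cb_wrap bcb;

	memset(&bcb, 0, sizeof(bcb));
	bcb.Signature = US_BULK_CB_SIGN;      /* the driver uses cpu_to_le32() here */
	bcb.Tag = 1;
	bcb.DataTransferLength = 512;
	bcb.Flags = US_BULK_FLAG_IN;
	bcb.Lun = 0;
	bcb.Length = sizeof(cdb);
	memcpy(bcb.CDB, cdb, sizeof(cdb));

	printf("CBW: sig=0x%08x tag=%u len=%u flags=0x%02x lun=%u cblen=%u\n",
	       (unsigned)bcb.Signature, (unsigned)bcb.Tag,
	       (unsigned)bcb.DataTransferLength, bcb.Flags, bcb.Lun, bcb.Length);
	return 0;
}

The 13-byte Command Status Wrapper checked in the status stage follows the same pattern (signature 'USBS', echoed tag, residue, status byte), which is why the data-stage code above treats an unexpected 13-byte response as a possible CSW.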
// SPDX-License-Identifier: GPL-2.0-only /* * vivid-radio-rx.c - radio receiver support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <linux/sched/signal.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include "vivid-core.h" #include "vivid-ctrls.h" #include "vivid-radio-common.h" #include "vivid-rds-gen.h" #include "vivid-radio-rx.h" ssize_t vivid_radio_rx_read(struct file *file, char __user *buf, size_t size, loff_t *offset) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rds_data *data = dev->rds_gen.data; bool use_alternates; ktime_t timestamp; unsigned blk; int perc; int i; if (dev->radio_rx_rds_controls) return -EINVAL; if (size < sizeof(*data)) return 0; size = sizeof(*data) * (size / sizeof(*data)); if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; if (dev->radio_rx_rds_owner && file->private_data != dev->radio_rx_rds_owner) { mutex_unlock(&dev->mutex); return -EBUSY; } if (dev->radio_rx_rds_owner == NULL) { vivid_radio_rds_init(dev); dev->radio_rx_rds_owner = file->private_data; } retry: timestamp = ktime_sub(ktime_get(), dev->radio_rds_init_time); blk = ktime_divns(timestamp, VIVID_RDS_NSEC_PER_BLK); use_alternates = (blk % VIVID_RDS_GEN_BLOCKS) & 1; if (dev->radio_rx_rds_last_block == 0 || dev->radio_rx_rds_use_alternates != use_alternates) { dev->radio_rx_rds_use_alternates = use_alternates; /* Re-init the RDS generator */ vivid_radio_rds_init(dev); } if (blk >= dev->radio_rx_rds_last_block + VIVID_RDS_GEN_BLOCKS) dev->radio_rx_rds_last_block = blk - VIVID_RDS_GEN_BLOCKS + 1; /* * No data is available if there hasn't been time to get new data, * or if the RDS receiver has been disabled, or if we use the data * from the RDS transmitter and that RDS transmitter has been disabled, * or if the signal quality is too weak.
*/ if (blk == dev->radio_rx_rds_last_block || !dev->radio_rx_rds_enabled || (dev->radio_rds_loop && !(dev->radio_tx_subchans & V4L2_TUNER_SUB_RDS)) || abs(dev->radio_rx_sig_qual) > 200) { mutex_unlock(&dev->mutex); if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; if (msleep_interruptible(20) && signal_pending(current)) return -EINTR; if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; goto retry; } /* abs(dev->radio_rx_sig_qual) <= 200, map that to a 0-50% range */ perc = abs(dev->radio_rx_sig_qual) / 4; for (i = 0; i < size && blk > dev->radio_rx_rds_last_block; dev->radio_rx_rds_last_block++) { unsigned data_blk = dev->radio_rx_rds_last_block % VIVID_RDS_GEN_BLOCKS; struct v4l2_rds_data rds = data[data_blk]; if (data_blk == 0 && dev->radio_rds_loop) vivid_radio_rds_init(dev); if (perc && get_random_u32_below(100) < perc) { switch (get_random_u32_below(4)) { case 0: rds.block |= V4L2_RDS_BLOCK_CORRECTED; break; case 1: rds.block |= V4L2_RDS_BLOCK_INVALID; break; case 2: rds.block |= V4L2_RDS_BLOCK_ERROR; rds.lsb = get_random_u8(); rds.msb = get_random_u8(); break; case 3: /* Skip block altogether */ if (i) continue; /* * Must make sure at least one block is * returned, otherwise the application * might think that end-of-file occurred. */ break; } } if (copy_to_user(buf + i, &rds, sizeof(rds))) { i = -EFAULT; break; } i += sizeof(rds); } mutex_unlock(&dev->mutex); return i; } __poll_t vivid_radio_rx_poll(struct file *file, struct poll_table_struct *wait) { return EPOLLIN | EPOLLRDNORM | v4l2_ctrl_poll(file, wait); } int vivid_radio_rx_enum_freq_bands(struct file *file, void *fh, struct v4l2_frequency_band *band) { if (band->tuner != 0) return -EINVAL; if (band->index >= TOT_BANDS) return -EINVAL; *band = vivid_radio_bands[band->index]; return 0; } int vivid_radio_rx_s_hw_freq_seek(struct file *file, void *fh, const struct v4l2_hw_freq_seek *a) { struct vivid_dev *dev = video_drvdata(file); unsigned low, high; unsigned freq; unsigned spacing; unsigned band; if (a->tuner) return -EINVAL; if (a->wrap_around && dev->radio_rx_hw_seek_mode == VIVID_HW_SEEK_BOUNDED) return -EINVAL; if (!a->wrap_around && dev->radio_rx_hw_seek_mode == VIVID_HW_SEEK_WRAP) return -EINVAL; if (!a->rangelow ^ !a->rangehigh) return -EINVAL; if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; if (a->rangelow) { for (band = 0; band < TOT_BANDS; band++) if (a->rangelow >= vivid_radio_bands[band].rangelow && a->rangehigh <= vivid_radio_bands[band].rangehigh) break; if (band == TOT_BANDS) return -EINVAL; if (!dev->radio_rx_hw_seek_prog_lim && (a->rangelow != vivid_radio_bands[band].rangelow || a->rangehigh != vivid_radio_bands[band].rangehigh)) return -EINVAL; low = a->rangelow; high = a->rangehigh; } else { for (band = 0; band < TOT_BANDS; band++) if (dev->radio_rx_freq >= vivid_radio_bands[band].rangelow && dev->radio_rx_freq <= vivid_radio_bands[band].rangehigh) break; if (band == TOT_BANDS) return -EINVAL; low = vivid_radio_bands[band].rangelow; high = vivid_radio_bands[band].rangehigh; } spacing = band == BAND_AM ? 
1600 : 16000; freq = clamp(dev->radio_rx_freq, low, high); if (a->seek_upward) { freq = spacing * (freq / spacing) + spacing; if (freq > high) { if (!a->wrap_around) return -ENODATA; freq = spacing * (low / spacing) + spacing; if (freq >= dev->radio_rx_freq) return -ENODATA; } } else { freq = spacing * ((freq + spacing - 1) / spacing) - spacing; if (freq < low) { if (!a->wrap_around) return -ENODATA; freq = spacing * ((high + spacing - 1) / spacing) - spacing; if (freq <= dev->radio_rx_freq) return -ENODATA; } } return 0; } int vivid_radio_rx_g_tuner(struct file *file, void *fh, struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); int delta = 800; int sig_qual; if (vt->index > 0) return -EINVAL; strscpy(vt->name, "AM/FM/SW Receiver", sizeof(vt->name)); vt->capability = V4L2_TUNER_CAP_LOW | V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_FREQ_BANDS | V4L2_TUNER_CAP_RDS | (dev->radio_rx_rds_controls ? V4L2_TUNER_CAP_RDS_CONTROLS : V4L2_TUNER_CAP_RDS_BLOCK_IO) | (dev->radio_rx_hw_seek_prog_lim ? V4L2_TUNER_CAP_HWSEEK_PROG_LIM : 0); switch (dev->radio_rx_hw_seek_mode) { case VIVID_HW_SEEK_BOUNDED: vt->capability |= V4L2_TUNER_CAP_HWSEEK_BOUNDED; break; case VIVID_HW_SEEK_WRAP: vt->capability |= V4L2_TUNER_CAP_HWSEEK_WRAP; break; case VIVID_HW_SEEK_BOTH: vt->capability |= V4L2_TUNER_CAP_HWSEEK_WRAP | V4L2_TUNER_CAP_HWSEEK_BOUNDED; break; } vt->rangelow = AM_FREQ_RANGE_LOW; vt->rangehigh = FM_FREQ_RANGE_HIGH; sig_qual = dev->radio_rx_sig_qual; vt->signal = abs(sig_qual) > delta ? 0 : 0xffff - ((unsigned)abs(sig_qual) * 0xffff) / delta; vt->afc = sig_qual > delta ? 0 : sig_qual; if (abs(sig_qual) > delta) vt->rxsubchans = 0; else if (dev->radio_rx_freq < FM_FREQ_RANGE_LOW || vt->signal < 0x8000) vt->rxsubchans = V4L2_TUNER_SUB_MONO; else if (dev->radio_rds_loop && !(dev->radio_tx_subchans & V4L2_TUNER_SUB_STEREO)) vt->rxsubchans = V4L2_TUNER_SUB_MONO; else vt->rxsubchans = V4L2_TUNER_SUB_STEREO; if (dev->radio_rx_rds_enabled && (!dev->radio_rds_loop || (dev->radio_tx_subchans & V4L2_TUNER_SUB_RDS)) && dev->radio_rx_freq >= FM_FREQ_RANGE_LOW && vt->signal >= 0xc000) vt->rxsubchans |= V4L2_TUNER_SUB_RDS; if (dev->radio_rx_rds_controls) vivid_radio_rds_init(dev); vt->audmode = dev->radio_rx_audmode; return 0; } int vivid_radio_rx_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); if (vt->index) return -EINVAL; dev->radio_rx_audmode = vt->audmode >= V4L2_TUNER_MODE_STEREO; return 0; }
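A minimal stand-alone sketch (not driver code) of the frequency-snapping arithmetic used by vivid_radio_rx_s_hw_freq_seek() above: an upward seek rounds up to the next multiple of the band spacing, a downward seek rounds down, and wrap-around restarts from the opposite band edge. The band limits and spacing passed in main() are made-up test values, not the vivid band tables.

/* seek_sketch.c - illustrative only */
#include <stdio.h>

/*
 * Step one station slot up or down from cur, clamped to [low, high].
 * Returns 0 and stores the new frequency, or -1 when no further slot
 * exists in that direction (the driver returns -ENODATA there).
 */
static int seek_step(unsigned cur, unsigned low, unsigned high,
		     unsigned spacing, int upward, int wrap, unsigned *out)
{
	unsigned freq = cur < low ? low : (cur > high ? high : cur);

	if (upward) {
		freq = spacing * (freq / spacing) + spacing;   /* next multiple up */
		if (freq > high) {
			if (!wrap)
				return -1;
			freq = spacing * (low / spacing) + spacing;
			if (freq >= cur)
				return -1;
		}
	} else {
		freq = spacing * ((freq + spacing - 1) / spacing) - spacing; /* next multiple down */
		if (freq < low) {
			if (!wrap)
				return -1;
			freq = spacing * ((high + spacing - 1) / spacing) - spacing;
			if (freq <= cur)
				return -1;
		}
	}
	*out = freq;
	return 0;
}

int main(void)
{
	unsigned f;

	if (seek_step(90000, 16000, 160000, 16000, 1, 1, &f) == 0)
		printf("next station slot up: %u\n", f);   /* prints 96000 */
	return 0;
}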
4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 
5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 
5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 
6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 
7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved.
*/ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" #include "disk-io.h" #include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" #include "space-info.h" #include "block-group.h" #include "discard.h" #include "zoned.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" #include "scrub.h" #include "super.h" #include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) struct btrfs_io_geometry { u32 stripe_index; u32 stripe_nr; int mirror_num; int num_stripes; u64 stripe_offset; u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0, /* 0 == as many as possible */ .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2, .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, .ncopies = 3, .nparity = 0, .raid_name = "raid1c3", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, }, [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, .ncopies = 4, .nparity = 0, .raid_name = "raid1c4", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, .dev_stripes = 2, .devs_max = 1, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 1, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, .ncopies = 1, .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, .ncopies = 1, .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; /* * Convert block 
group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which * can be used as index to access btrfs_raid_array[]. */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); if (!profile) return BTRFS_RAID_SINGLE; return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); if (index >= BTRFS_NR_RAID_TYPES) return NULL; return btrfs_raid_array[index].raid_name; } int btrfs_nr_parity_stripes(u64 type) { enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); return btrfs_raid_array[index].nparity; } /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. */ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) { int i; int ret; char *bp = buf; u64 flags = bg_flags; u32 size_bp = size_buf; if (!flags) { strcpy(bp, "NONE"); return; } #define DESCRIBE_FLAG(flag, desc) \ do { \ if (flags & (flag)) { \ ret = snprintf(bp, size_bp, "%s|", (desc)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ flags &= ~(flag); \ } \ } while (0) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, btrfs_raid_array[i].raid_name); #undef DESCRIBE_FLAG if (flags) { ret = snprintf(bp, size_bp, "0x%llx|", flags); size_bp -= ret; } if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ /* * The text is trimmed, it's up to the caller to provide sufficiently * large buffer */ out_overflow:; } static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); /* * Device locking * ============== * * There are several mutexes that protect manipulation of devices and low-level * structures like chunks but not block groups, extents or files * * uuid_mutex (global lock) * ------------------------ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from * the SCAN_DEV ioctl registration or from mount either implicitly (the first * device) or requested by the device= mount option * * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * * does not protect: manipulation of the fs_devices::devices list in general * but in mount context it could be used to exclude list modifications by eg. * scan ioctl * * btrfs_device::name - renames (write side), read is RCU * * fs_devices::device_list_mutex (per-fs, with RCU) * ------------------------------------------------ * protects updates to fs_devices::devices, ie. adding and deleting * * simple list traversal with read-only actions can be done with RCU protection * * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * * Is not required at mount and close times, because our device list is * protected by the uuid_mutex at that point. 
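 *
 * As an illustrative sketch of the rules above (not taken verbatim from any
 * one function here; device and new_device are placeholders): a read-only walk
 * of fs_devices::devices can rely on RCU alone, while any modification of the
 * list must hold device_list_mutex, e.g.
 *
 *	rcu_read_lock();
 *	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list)
 *		pr_info("btrfs: devid %llu\n", device->devid);
 *	rcu_read_unlock();
 *
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	list_add_rcu(&new_device->dev_list, &fs_devices->devices);
 *	fs_devices->num_devices++;
 *	mutex_unlock(&fs_devices->device_list_mutex);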
* * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from * several places (internally, ioctl) * * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new * device is added/removed. Additionally it also protects post_commit_list of * individual devices, since they can be added to the transaction's * post_commit_list only with chunk_mutex held. * * cleaner_mutex * ------------- * a big lock that is held by the cleaner thread and prevents running subvolume * cleaning together with relocation or delayed iputs * * * Lock nesting * ============ * * uuid_mutex * device_list_mutex * chunk_mutex * balance_mutex * * * Exclusive operations * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. * * - Balance (*) * - Device add * - Device remove * - Device replace (*) * - Resize * * The device operations (as above) can be in one of the following states: * * - Running state * - Paused state * - Completed state * * Only device operations marked with (*) can go into the Paused state for the * following reasons: * * - ioctl (only Balance can be Paused through ioctl) * - filesystem remounted as read-only * - filesystem unmounted and mounted as read-only * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * * The status of exclusive operation is set and cleared atomically. * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. * The exclusive status is cleared when the device operation is canceled or * completed. */ DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) { return &fs_uuids; } /* * Allocate new btrfs_fs_devices structure identified by a fsid. * * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. 
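 *
 * A minimal usage sketch, mirroring how device_list_add() below consumes the
 * return value:
 *
 *	fs_devices = alloc_fs_devices(disk_super->fsid);
 *	if (IS_ERR(fs_devices))
 *		return ERR_CAST(fs_devices);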
*/ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; } static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } kfree(fs_devices); } void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { fs_devices = list_entry(fs_uuids.next, struct btrfs_fs_devices, fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, const u8 *fsid, const u8 *metadata_fsid) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) return false; if (!metadata_fsid) return true; if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) return false; return true; } static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; ASSERT(fsid); /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) return fs_devices; } return NULL; } static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { fput(*bdev_file); goto error; } } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); fput(*bdev_file); goto error; } return 0; error: *disk_super = NULL; *bdev_file = NULL; return ret; } /* * Search and remove all stale devices (which are not mounted). When both * inputs are NULL, it will search and release all stale devices. * * @devt: Optional. When provided, it will release all unmounted devices * matching this devt only. * @skip_device: Optional. Will skip this device when searching for stale * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. * -ENOENT if @devt does not match any device in the list.
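 *
 * A minimal usage sketch, matching btrfs_forget_devices() below; the caller
 * must hold uuid_mutex:
 *
 *	mutex_lock(&uuid_mutex);
 *	ret = btrfs_free_stale_devices(devt, NULL);
 *	mutex_unlock(&uuid_mutex);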
*/ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; int ret; bool freed = false; lockdep_assert_held(&uuid_mutex); /* Return good status if there is no instance of devt. */ ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; if (devt && devt != device->devt) continue; if (fs_devices->opened) { if (devt) ret = -EBUSY; break; } /* delete the stale device */ fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); freed = true; } mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->num_devices == 0) { btrfs_sysfs_remove_fsid(fs_devices); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } /* If there is at least one freed device return 0. */ if (freed) return 0; return ret; } static struct btrfs_fs_devices *find_fsid_by_device( struct btrfs_super_block *disk_super, dev_t devt, bool *same_fsid_diff_dev) { struct btrfs_fs_devices *fsid_fs_devices; struct btrfs_fs_devices *devt_fs_devices; const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool found_by_devt = false; /* Find the fs_device by the usual method, if found use it. */ fsid_fs_devices = find_fsid(disk_super->fsid, has_metadata_uuid ? disk_super->metadata_uuid : NULL); /* The temp_fsid feature is supported only with single device filesystem. */ if (btrfs_super_num_devices(disk_super) != 1) return fsid_fs_devices; /* * A seed device is an integral component of the sprout device, which * functions as a multi-device filesystem. So, temp-fsid feature is * not supported. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) return fsid_fs_devices; /* Try to find a fs_devices by matching devt. */ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { struct btrfs_device *device; list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { if (device->devt == devt) { found_by_devt = true; break; } } if (found_by_devt) break; } if (found_by_devt) { /* Existing device. */ if (fsid_fs_devices == NULL) { if (devt_fs_devices->opened == 0) { /* Stale device. */ return NULL; } else { /* temp_fsid is mounting a subvol. */ return devt_fs_devices; } } else { /* Regular or temp_fsid device mounting a subvol. */ return devt_fs_devices; } } else { /* New device. */ if (fsid_fs_devices == NULL) { return NULL; } else { /* sb::fsid is already used create a new temp_fsid. */ *same_fsid_diff_dev = true; return NULL; } } /* Not reached. */ } /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the * fs_devices->device_list_mutex here. 
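 *
 * A caller-side sketch, loosely mirroring open_fs_devices() below (which also
 * handles -ENODATA and error propagation); uuid_mutex is held by
 * btrfs_open_devices() at that point:
 *
 *	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) {
 *		if (btrfs_open_one_device(fs_devices, device, flags, holder) == 0 &&
 *		    (!latest_dev || device->generation > latest_dev->generation))
 *			latest_dev = device;
 *	}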
*/ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; if (device->bdev) return -EINVAL; if (!device->name) return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, &bdev_file, &disk_super); if (ret) return ret; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) goto error_free_page; device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { pr_err( "BTRFS: Invalid seeding and uuid-changed device detected\n"); goto error_free_page; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; device->bdev_file = bdev_file; device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (device->devt != device->bdev->bd_dev) { btrfs_warn(NULL, "device %s maj:min changed from %d:%d to %d:%d", device->name->str, MAJOR(device->devt), MINOR(device->devt), MAJOR(device->bdev->bd_dev), MINOR(device->bdev->bd_dev)); device->devt = device->bdev->bd_dev; } fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++; list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } btrfs_release_disk_super(disk_super); return 0; error_free_page: btrfs_release_disk_super(disk_super); fput(bdev_file); return -EINVAL; } const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) { bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } /* * We can have very weird soft links passed in. * One example is "/proc/self/fd/<fd>", which can be a soft link to * a block device. * * But it's never a good idea to use those weird names. * Here we check if the path (not following symlinks) is a good one inside * "/dev/". */ static bool is_good_dev_path(const char *dev_path) { struct path path = { .mnt = NULL, .dentry = NULL }; char *path_buf = NULL; char *resolved_path; bool is_good = false; int ret; if (!dev_path) goto out; path_buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!path_buf) goto out; /* * Do not follow soft link, just check if the original path is inside * "/dev/". 
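 * (This is why the kern_path() call below passes 0 rather than LOOKUP_FOLLOW,
 * while get_canonical_dev_path() afterwards does pass LOOKUP_FOLLOW to resolve
 * the link.)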
*/ ret = kern_path(dev_path, 0, &path); if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); if (IS_ERR(resolved_path)) goto out; if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) goto out; is_good = true; out: kfree(path_buf); path_put(&path); return is_good; } static int get_canonical_dev_path(const char *dev_path, char *canonical) { struct path path = { .mnt = NULL, .dentry = NULL }; char *path_buf = NULL; char *resolved_path; int ret; if (!dev_path) { ret = -EINVAL; goto out; } path_buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!path_buf) { ret = -ENOMEM; goto out; } ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); if (IS_ERR(resolved_path)) { ret = PTR_ERR(resolved_path); goto out; } ret = strscpy(canonical, resolved_path, PATH_MAX); out: kfree(path_buf); path_put(&path); return ret; } static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; struct path new = { .mnt = NULL, .dentry = NULL }; char *old_path = NULL; bool is_same = false; int ret; if (!device->name) goto out; old_path = kzalloc(PATH_MAX, GFP_NOFS); if (!old_path) goto out; rcu_read_lock(); ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); rcu_read_unlock(); if (ret < 0) goto out; ret = kern_path(old_path, LOOKUP_FOLLOW, &old); if (ret) goto out; ret = kern_path(new_path, LOOKUP_FOLLOW, &new); if (ret) goto out; if (path_equal(&old, &new)) is_same = true; out: kfree(old_path); path_put(&old); path_put(&new); return is_same; } /* * Add new device to list of registered devices * * Returns: * device pointer which was just added or updated when successful * error pointer when failed */ static noinline struct btrfs_device *device_list_add(const char *path, struct btrfs_super_block *disk_super, bool *new_device_added) { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; int error; bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { btrfs_err(NULL, "device %s has incomplete metadata_uuid change, please use btrfstune to complete", path); return ERR_PTR(-EAGAIN); } error = lookup_bdev(path, &path_devt); if (error) { btrfs_err(NULL, "failed to lookup block device for path %s: %d", path, error); return ERR_PTR(error); } fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); if (has_metadata_uuid) memcpy(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid); } mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); device = NULL; } else { struct btrfs_dev_lookup_args args = { .devid = devid, .uuid = disk_super->dev_item.uuid, }; mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, &args); if (found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 
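/* Also keep the cached metadata_uuid in sync with the newest super block seen. */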
memcpy(fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); } } if (!device) { unsigned int nofs_flag; if (fs_devices->opened) { btrfs_err(NULL, "device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid, path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; device->fs_devices = fs_devices; *new_device_added = true; if (disk_super->label[0]) pr_info( "BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( "BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); } else if (!device->name || !is_same_device(device, path)) { /* * When the FS is already mounted: * 1. If you are here and if the device->name is NULL, that * means this device was missing at the time of FS mount. * 2. If you are here and if the device->name is different * from 'path', that means either * a. The same device disappeared and reappeared with * a different name, or * b. The missing-disk-which-was-replaced has * reappeared now. * * We must allow 1 and 2a above. But 2b would be spurious * and unintentional. * * Further, in case of 1 and 2a above, the disk at 'path' * would have missed some transaction when it was away and * in case of 2a the stale bdev has to be updated as well. * 2b must not be allowed at any time. */ /* * For now, we do allow updates to btrfs_fs_device through the * btrfs dev scan cli after the FS has been mounted. We're still * tracking a problem where systems fail to mount by subvolume id * when we reject replacement on a mounted FS. */ if (!fs_devices->opened && found_transid < device->generation) { /* * That is, if the FS is _not_ mounted and if you * are here, that means there is more than one * disk with the same uuid and devid. We keep the one * with the larger generation number or the last-in if * the generations are equal. */ mutex_unlock(&fs_devices->device_list_mutex); btrfs_err(NULL, "device %s already registered with a higher generation, found %llu expect %llu", path, found_transid, device->generation); return ERR_PTR(-EEXIST); } /* * We are going to replace the device path for a given devid, * make sure it's the same device if the device is mounted. * * NOTE: the device->fs_info may not be reliable here, so pass * in NULL to the message helpers instead. This avoids a possible * use-after-free when the fs_info and fs_info->sb are already * torn down.
*/ if (device->bdev) { if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, task_pid_nr(current)); return ERR_PTR(-EEXIST); } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); } rcu_string_free(device->name); rcu_assign_pointer(device->name, name); if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } device->devt = path_devt; } /* * Unmount does not free the btrfs_device struct but would zero * generation along with most of the other members. So just update * it back. We need it to pick the disk with largest generation * (as above). */ if (!fs_devices->opened) { device->generation = found_transid; fs_devices->latest_generation = max_t(u64, found_transid, fs_devices->latest_generation); } fs_devices->total_devices = btrfs_super_num_devices(disk_super); mutex_unlock(&fs_devices->device_list_mutex); return device; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) { struct btrfs_fs_devices *fs_devices; struct btrfs_device *device; struct btrfs_device *orig_dev; int ret = 0; lockdep_assert_held(&uuid_mutex); fs_devices = alloc_fs_devices(orig->fsid); if (IS_ERR(fs_devices)) return fs_devices; fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { const char *dev_path = NULL; /* * This is ok to do without RCU read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ if (orig_dev->name) dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } if (orig_dev->zone_info) { struct btrfs_zoned_device_info *zone_info; zone_info = btrfs_clone_dev_zone_info(orig_dev); if (!zone_info) { btrfs_free_device(device); ret = -ENOMEM; goto error; } device->zone_info = zone_info; } list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; } return fs_devices; error: free_fs_devices(fs_devices); return ERR_PTR(ret); } static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && (!*latest_dev || device->generation > (*latest_dev)->generation)) { *latest_dev = device; } continue; } /* * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, * in btrfs_init_dev_replace() so just continue. 
*/ if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; if (device->bdev_file) { fput(device->bdev_file); device->bdev = NULL; device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; btrfs_free_device(device); } } /* * After we have read the system tree and know devids belonging to this * filesystem, remove the device which does not belong there. */ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *latest_dev = NULL; struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); __btrfs_free_extra_devids(fs_devices, &latest_dev); list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) __btrfs_free_extra_devids(seed_dev, &latest_dev); fs_devices->latest_dev = latest_dev; mutex_unlock(&uuid_mutex); } static void btrfs_close_bdev(struct btrfs_device *device) { if (!device->bdev) return; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; } btrfs_close_bdev(device); if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); extent_io_tree_release(&device->alloc_state); /* * Reset the flush error record. We might have a transient flush error * in this mount, and if so we aborted the current transaction and set * the fs to an error state, guaranteeing no super blocks can be further * committed. However that error might be transient and if we unmount the * filesystem and mount it again, we should allow the mount to succeed * (btrfs_check_rw_degradable() should not fail) - if after mounting the * filesystem again we still get flush errors, then we will again abort * any transaction and set the error state, guaranteeing no commits of * unsafe super blocks. 
*/ device->last_flush_error = 0; /* Verify the device is back in a pristine state */ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); WARN_ON(!list_empty(&device->dev_alloc_list)); WARN_ON(!list_empty(&device->post_commit_list)); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; lockdep_assert_held(&uuid_mutex); if (--fs_devices->opened > 0) return; list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; fs_devices->fs_info = NULL; } void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { LIST_HEAD(list); struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); /* * If the struct btrfs_fs_devices is not assembled with any * other device, it can be re-initialized during the next mount * without the needing device-scan step. Therefore, it can be * fully freed. */ if (fs_devices->num_devices == 1) { list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder) { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { int ret2; ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } if (ret == 0 && ret2 != 0) ret = ret2; } if (fs_devices->open_devices == 0) { if (ret) return ret; return -EINVAL; } fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid = latest_dev->devid; fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), &value); if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = true; if (value) { if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) fs_devices->rr_min_contig_read = value; if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) fs_devices->read_devid = value; } #else fs_devices->read_policy = BTRFS_READ_POLICY_PID; #endif return 0; } static int devid_cmp(void *priv, const struct list_head *a, const struct list_head *b) { const struct btrfs_device *dev1, *dev2; dev1 = list_entry(a, struct btrfs_device, dev_list); dev2 = list_entry(b, struct btrfs_device, dev_list); if (dev1->devid < dev2->devid) return -1; else if (dev1->devid > dev2->devid) return 1; return 0; } int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder) { int ret; lockdep_assert_held(&uuid_mutex); /* * The device_list_mutex cannot be taken here in case opening the * underlying device takes further locks like open_mutex. 
* * We also don't need the lock here as this is called during mount and * exclusion is provided by uuid_mutex */ if (fs_devices->opened) { fs_devices->opened++; ret = 0; } else { list_sort(NULL, &fs_devices->devices, devid_cmp); ret = open_fs_devices(fs_devices, flags, holder); } return ret; } void btrfs_release_disk_super(struct btrfs_super_block *super) { struct page *page = virt_to_page(super); put_page(page); } static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, u64 bytenr_orig) { struct btrfs_super_block *disk_super; struct page *page; void *p; pgoff_t index; /* make sure our super fits in the device */ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ if (sizeof(*disk_super) > PAGE_SIZE) return ERR_PTR(-EINVAL); /* make sure our super doesn't straddle pages on disk */ index = bytenr >> PAGE_SHIFT; if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) return ERR_PTR(-EINVAL); /* pull in the page with our super */ page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); if (IS_ERR(page)) return ERR_CAST(page); p = page_address(page); /* align our pointer to the offset of the super block */ disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(disk_super) != bytenr_orig || btrfs_super_magic(disk_super) != BTRFS_MAGIC) { btrfs_release_disk_super(p); return ERR_PTR(-EINVAL); } if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; return disk_super; } int btrfs_forget_devices(dev_t devt) { int ret; mutex_lock(&uuid_mutex); ret = btrfs_free_stale_devices(devt, NULL); mutex_unlock(&uuid_mutex); return ret; } static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, const char *path, dev_t devt, bool mount_arg_dev) { struct btrfs_fs_devices *fs_devices; /* * Do not skip device registration for mounted devices with matching * maj:min but different paths. Booting without initrd relies on * /dev/root initially, later replaced with the actual root device. * A successful scan ensures grub2-probe selects the correct device. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { struct btrfs_device *device; mutex_lock(&fs_devices->device_list_mutex); if (!fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); continue; } list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev && (device->bdev->bd_dev == devt) && strcmp(device->name->str, path) != 0) { mutex_unlock(&fs_devices->device_list_mutex); /* Do not skip registration. */ return false; } } mutex_unlock(&fs_devices->device_list_mutex); } if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) return true; return false; } /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock * is read via pagecache. * * With @mount_arg_dev it's a scan during mount time that will always register * the device or return an error. Multi-device and seeding devices are registered * in both cases. 
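 *
 * As an aside, the "does the super block straddle a page" test in
 * btrfs_read_disk_super() above is plain page-index arithmetic.  A minimal
 * userspace sketch of the same check (DEMO_PAGE_SHIFT and fits_in_one_page()
 * are illustrative names, not btrfs code; 4 KiB pages assumed):
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   #define DEMO_PAGE_SHIFT 12                    // assume 4 KiB pages
 *
 *   // true if [bytenr, bytenr + len) lies entirely within one page
 *   static bool fits_in_one_page(uint64_t bytenr, uint64_t len)
 *   {
 *           uint64_t index = bytenr >> DEMO_PAGE_SHIFT;
 *
 *           return ((bytenr + len - 1) >> DEMO_PAGE_SHIFT) == index;
 *   }
 *
 * With that guarantee, the super block can be addressed at
 * page_address(page) + offset_in_page(bytenr) without crossing into a
 * second page.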
*/ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); if (!is_good_dev_path(path)) { canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); if (canonical_path) { ret = get_canonical_dev_path(path, canonical_path); if (ret < 0) { kfree(canonical_path); canonical_path = NULL; } } } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, * resulting in failure. * Since the device scan is solely for reading purposes, there is no * need for an exclusive open. Additionally, the devices are read again * during the mount process. It is ok to get some inconsistent * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); /* * We would like to check all the super blocks, but doing so would * allow a mount to succeed after a mkfs from a different filesystem. * Currently, recovery from a bad primary btrfs superblock is done * using the userspace command 'btrfs check --super'. */ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, btrfs_sb_offset(0)); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } devt = file_bdev(bdev_file)->bd_dev; if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", path, MAJOR(devt), MINOR(devt)); btrfs_free_stale_devices(devt, NULL); device = NULL; goto free_disk_super; } device = device_list_add(canonical_path ? : path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: fput(bdev_file); kfree(canonical_path); return device; } /* * Try to find a chunk that intersects [start, start + len] range and when one * such is found, record the end of it in *start */ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, u64 len) { u64 physical_start, physical_end; lockdep_assert_held(&device->fs_info->chunk_mutex); if (find_first_extent_bit(&device->alloc_state, *start, &physical_start, &physical_end, CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, physical_end + 1 - physical_start)) { *start = physical_end + 1; return true; } } return false; } static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular * allocator, because we anyway use/reserve the first two zones * for superblock logging. 
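 *
 * For reference, the overlap test that contains_pending_extent() above
 * builds out of two in_range() calls reduces to the usual half-open
 * interval check.  A small userspace sketch (ranges_overlap() is an
 * illustrative name, not a btrfs helper):
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   // [a, a + a_len) and [b, b + b_len) overlap iff each one starts
 *   // before the other one ends (lengths assumed non-zero).
 *   static bool ranges_overlap(uint64_t a, uint64_t a_len,
 *                              uint64_t b, uint64_t b_len)
 *   {
 *           return a < b + b_len && b < a + a_len;
 *   }
 *
 * When such an overlap is found, the search start is bumped to just past
 * the end of the allocated chunk and the caller retries.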
*/ return 0; default: BUG(); } } static bool dev_extent_hole_check_zoned(struct btrfs_device *device, u64 *hole_start, u64 *hole_size, u64 num_bytes) { u64 zone_size = device->zone_info->zone_size; u64 pos; int ret; bool changed = false; ASSERT(IS_ALIGNED(*hole_start, zone_size)); while (*hole_size > 0) { pos = btrfs_find_allocatable_zones(device, *hole_start, *hole_start + *hole_size, num_bytes); if (pos != *hole_start) { *hole_size = *hole_start + *hole_size - pos; *hole_start = pos; changed = true; if (*hole_size < num_bytes) break; } ret = btrfs_ensure_empty_zones(device, pos, num_bytes); /* Range is ensured to be empty */ if (!ret) return changed; /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; *hole_size = 0; return true; } *hole_start += zone_size; *hole_size -= zone_size; changed = true; } return changed; } /* * Check if specified hole is suitable for allocation. * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, u64 *hole_size, u64 num_bytes) { bool changed = false; u64 hole_end = *hole_start + *hole_size; for (;;) { /* * Check before we set max_hole_start, otherwise we could end up * sending back this offset anyway. */ if (contains_pending_extent(device, hole_start, *hole_size)) { if (hole_end >= *hole_start) *hole_size = hole_end - *hole_start; else *hole_size = 0; changed = true; } switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: /* No extra check */ break; case BTRFS_CHUNK_ALLOC_ZONED: if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes)) { changed = true; /* * The changed hole can contain pending extent. * Loop again to check that. */ continue; } break; default: BUG(); } break; } return changed; } /* * Find free space in the specified device. * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search * @start: store the start of the free space. * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * * This does a pretty simple search, the expectation is that it is called very * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find. But if we * don't find suitable free space, it will be used to store the start position * of the max free space. * * @len is used to store the size of the free space that we find. * But if we don't find suitable free space, it is used to store the size of * the max free space. * * NOTE: This function will search *commit* root of device tree, and does extra * check to ensure dev extents are not double allocated. * This makes the function safe to allocate dev extents but may not report * correct usable device space, as device extent freed in current transaction * is not reported as available. 
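 *
 * The search below boils down to: walk the device's extents in offset
 * order, measure the gap in front of each one, remember the largest gap
 * seen, and stop early once a gap is big enough.  A condensed userspace
 * sketch over a sorted array (struct demo_extent, find_hole() and the
 * array representation are illustrative only; the real code walks btree
 * items and re-checks each hole against pending chunks and zone rules):
 *
 *   #include <stdbool.h>
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   struct demo_extent { uint64_t start; uint64_t len; };  // sorted by start
 *
 *   static bool find_hole(const struct demo_extent *ext, size_t n,
 *                         uint64_t dev_size, uint64_t need,
 *                         uint64_t *hole_start, uint64_t *hole_len)
 *   {
 *           uint64_t cur = 0;                    // search cursor
 *           uint64_t best_start = 0, best_len = 0;
 *
 *           for (size_t i = 0; i <= n; i++) {
 *                   uint64_t next = (i < n) ? ext[i].start : dev_size;
 *                   uint64_t gap = (next > cur) ? next - cur : 0;
 *
 *                   if (gap > best_len) {
 *                           best_start = cur;
 *                           best_len = gap;
 *                   }
 *                   if (gap >= need)
 *                           break;               // first fit is enough
 *                   if (i < n && ext[i].start + ext[i].len > cur)
 *                           cur = ext[i].start + ext[i].len;
 *           }
 *           *hole_start = best_start;
 *           *hole_len = best_len;
 *           return best_len >= need;
 *   }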
*/ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; u64 search_start; u64 hole_size; u64 max_hole_start; u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; search_start = dev_extent_search_start(device); max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; goto out; } path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret == 0) continue; if (ret < 0) goto out; break; } btrfs_item_key_to_cpu(l, &key, slot); if (key.objectid < device->devid) goto next; if (key.objectid > device->devid) break; if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; if (key.offset > search_end) break; if (key.offset > search_start) { hole_size = key.offset - search_start; dev_extent_hole_check(device, &search_start, &hole_size, num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; } /* * If this free space is greater than which we need, * it must be the max free space that we have found * until now, so max_hole_start must point to the start * of this free space and the length of this free space * is stored in max_hole_size. Thus, we return * max_hole_start and max_hole_size and go back to the * caller. */ if (hole_size >= num_bytes) { ret = 0; goto out; } } dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); extent_end = key.offset + btrfs_dev_extent_length(l, dev_extent); if (extent_end > search_start) search_start = extent_end; next: path->slots[0]++; cond_resched(); } /* * At this point, search_start should be the end of * allocated dev extents, and when shrinking the device, * search_end may be smaller than search_start. */ if (search_end > search_start) { hole_size = search_end - search_start; if (dev_extent_hole_check(device, &search_start, &hole_size, num_bytes)) { btrfs_release_path(path); goto again; } if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; } } /* See above. 
*/ if (max_hole_size < num_bytes) ret = -ENOSPC; else ret = 0; ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; if (len) *len = max_hole_size; return ret; } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; int ret; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf = NULL; struct btrfs_dev_extent *extent = NULL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, BTRFS_DEV_EXTENT_KEY); if (ret) goto out; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); key = found_key; btrfs_release_path(path); goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { goto out; } *dev_extent_len = btrfs_dev_extent_length(leaf, extent); ret = btrfs_del_item(trans, root, path); if (ret == 0) set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); out: btrfs_free_path(path); return ret; } static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { struct rb_node *n; u64 ret = 0; read_lock(&fs_info->mapping_tree_lock); n = rb_last(&fs_info->mapping_tree.rb_root); if (n) { struct btrfs_chunk_map *map; map = rb_entry(n, struct btrfs_chunk_map, rb_node); ret = map->start + map->chunk_len; } read_unlock(&fs_info->mapping_tree_lock); return ret; } static noinline int find_next_devid(struct btrfs_fs_info *fs_info, u64 *devid_ret) { int ret; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) goto error; if (ret == 0) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); ret = -EUCLEAN; goto error; } ret = btrfs_previous_item(fs_info->chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); if (ret) { *devid_ret = 1; } else { btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); *devid_ret = found_key.offset + 1; } ret = 0; error: btrfs_free_path(path); return ret; } /* * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; struct btrfs_path *path; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; unsigned long ptr; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct 
btrfs_dev_item); btrfs_set_device_id(leaf, dev_item, device->devid); btrfs_set_device_generation(leaf, dev_item, 0); btrfs_set_device_type(leaf, dev_item, device->type); btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); btrfs_set_device_total_bytes(leaf, dev_item, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); btrfs_set_device_bandwidth(leaf, dev_item, 0); btrfs_set_device_start_offset(leaf, dev_item, 0); ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); ret = 0; out: btrfs_free_path(path); return ret; } /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. * * We don't care about errors here, this is just to be kind to userspace. */ static void update_dev_time(const char *device_path) { struct path path; int ret; ret = kern_path(device_path, LOOKUP_FOLLOW, &path); if (ret) return; inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); path_put(&path); } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); return ret; } /* * Verify that @num_devices satisfies the RAID profile constraints in the whole * filesystem. It's up to the caller to adjust that number regarding eg. device * replace. */ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, u64 num_devices) { u64 all_avail; unsigned seq; int i; do { seq = read_seqbegin(&fs_info->profiles_lock); all_avail = fs_info->avail_data_alloc_bits | fs_info->avail_system_alloc_bits | fs_info->avail_metadata_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; if (num_devices < btrfs_raid_array[i].devs_min) return btrfs_raid_array[i].mindev_error; } return 0; } static struct btrfs_device * btrfs_find_next_active_device( struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; list_for_each_entry(next_device, &fs_devs->devices, dev_list) { if (next_device != device && !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) && next_device->bdev) return next_device; } return NULL; } /* * Helper function to check if the given device is part of s_bdev / latest_dev * and replace it with the provided or the next active device, in the context * where this function called, there should be always be another device (or * this_dev) which is active. 
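 *
 * (A note on btrfs_check_raid_min_devices() a little further up: the
 * read_seqbegin()/read_seqretry() loop is there so the three avail_*
 * bitmasks are sampled as one consistent snapshot without blocking
 * writers.  A hedged kernel-style sketch of that pattern, with
 * demo_read_profiles() as an illustrative name:
 *
 *   static u64 demo_read_profiles(struct btrfs_fs_info *fs_info)
 *   {
 *           u64 snapshot;
 *           unsigned int seq;
 *
 *           do {
 *                   seq = read_seqbegin(&fs_info->profiles_lock);
 *                   snapshot = fs_info->avail_data_alloc_bits |
 *                              fs_info->avail_system_alloc_bits |
 *                              fs_info->avail_metadata_alloc_bits;
 *           } while (read_seqretry(&fs_info->profiles_lock, seq));
 *
 *           return snapshot;
 *   }
 *
 * If a writer updated the masks mid-read, read_seqretry() reports a
 * changed sequence count and the read is simply repeated.)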
*/ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, device); ASSERT(next_device); if (fs_info->sb->s_bdev && (fs_info->sb->s_bdev == device->bdev)) fs_info->sb->s_bdev = next_device->bdev; if (fs_info->fs_devices->latest_dev->bdev == device->bdev) fs_info->fs_devices->latest_dev = next_device; } /* * Return btrfs_fs_devices::num_devices excluding the device that's being * currently replaced. */ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) { u64 num_devices = fs_info->fs_devices->num_devices; down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { ASSERT(num_devices > 1); num_devices--; } up_read(&fs_info->dev_replace.rwsem); return num_devices; } static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, struct block_device *bdev, int copy_num) { struct btrfs_super_block *disk_super; const size_t len = sizeof(disk_super->magic); const u64 bytenr = btrfs_sb_offset(copy_num); int ret; disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); if (IS_ERR(disk_super)) return; memset(&disk_super->magic, 0, len); folio_mark_dirty(virt_to_folio(disk_super)); btrfs_release_disk_super(disk_super); ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); if (ret) btrfs_warn(fs_info, "error clearing superblock number %d (%d)", copy_num, ret); } void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; struct block_device *bdev = device->bdev; if (!bdev) return; for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { if (bdev_is_zoned(bdev)) btrfs_reset_sb_log_zones(bdev, copy_num); else btrfs_scratch_superblock(fs_info, bdev, copy_num); } /* Notify udev that device has changed */ btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); return -EINVAL; } /* * The device list in fs_devices is accessed without locks (neither * uuid_mutex nor device_list_mutex) as it won't change on a mounted * filesystem and another device rm cannot run. 
*/ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { if (args->missing) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", btrfs_dev_name(device), device->devid); return -ETXTBSY; } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && fs_info->fs_devices->rw_devices == 1) return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; mutex_unlock(&fs_info->chunk_mutex); } ret = btrfs_shrink_device(device, 0); if (ret) goto error_undo; trans = btrfs_start_transaction(fs_info->chunk_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto error_undo; } ret = btrfs_rm_dev_item(trans, device); if (ret) { /* Any error in dev item removal is critical */ btrfs_crit(fs_info, "failed to remove device item for devid %llu: %d", device->devid, ret); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); /* * the device list mutex makes sure that we don't change * the device list while someone else is writing out all * the device supers. Whoever is writing all supers, should * lock the device list mutex before getting the number of * devices in the super block (super_copy). Conversely, * whoever updates the number of devices in the super block * (super_copy) should hold the device list mutex. */ /* * In normal cases the cur_devices == fs_devices. But in case * of deleting a seed device, the cur_devices should point to * its own fs_devices listed under the fs_devices->seed_list. */ cur_devices = device->fs_devices; mutex_lock(&fs_devices->device_list_mutex); list_del_rcu(&device->dev_list); cur_devices->num_devices--; cur_devices->total_devices--; /* Update total_devices of the parent fs_devices if it's seed */ if (cur_devices != fs_devices) fs_devices->total_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) cur_devices->missing_devices--; btrfs_assign_next_active_device(device, NULL); if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; btrfs_set_super_num_devices(fs_info->super_copy, num_devices); mutex_unlock(&fs_devices->device_list_mutex); /* * At this point, the device is zero sized and detached from the * devices list. All that's left is to zero out the old supers and * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb * write lock, and fput() on the block device will pull in the * ->open_mutex on the block device and it's dependencies. Instead * just flush the device and let the caller do the final bdev_release. 
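 *
 * ("Scratching" the super blocks below just wipes the btrfs magic in each
 * super block copy so probing tools no longer recognize the device, then
 * flushes that range to disk (zoned devices reset their super block zones
 * instead).  A hedged userspace sketch of the idea, with scratch_magic()
 * as an illustrative name and the offset/length of the magic passed in
 * rather than hard-coded:
 *
 *   #include <sys/types.h>
 *   #include <unistd.h>
 *
 *   static int scratch_magic(int fd, off_t magic_off, size_t magic_len)
 *   {
 *           char zeroes[16] = { 0 };
 *
 *           if (magic_len > sizeof(zeroes))
 *                   return -1;
 *           if (pwrite(fd, zeroes, magic_len, magic_off) != (ssize_t)magic_len)
 *                   return -1;
 *           return fsync(fd);        // make sure the wipe hits the media
 *   }
 *
 * The kernel path does the same through the page cache:
 * btrfs_scratch_superblock() zeroes disk_super->magic, marks the folio
 * dirty and calls sync_blockdev_range().)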
*/ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } } *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); /* * This can happen if cur_devices is the private seed devices list. We * cannot call close_fs_devices() here because it expects the uuid_mutex * to be held, but in fact we don't need that for the private * seed_devices, we can simply decrement cur_devices->opened and then * remove it from our list and free the fs_devices. */ if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); ASSERT(cur_devices->opened == 1); cur_devices->opened--; free_fs_devices(cur_devices); } ret = btrfs_commit_transaction(trans); return ret; error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); /* * in case of fs with no seed, srcdev->fs_devices will point * to fs_devices of fs_info. However when the dev being replaced is * a seed dev it will point to the seed's local fs_devices. In short * srcdev will have its correct fs_devices in both the cases. */ fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) fs_devices->missing_devices--; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) fs_devices->rw_devices--; if (srcdev->bdev) fs_devices->open_devices--; } void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; mutex_lock(&uuid_mutex); btrfs_close_bdev(srcdev); synchronize_rcu(); btrfs_free_device(srcdev); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace * target added to the sprout FS, so there will be no more * device left under the seed FS. */ ASSERT(fs_devices->seeding); list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; mutex_lock(&fs_devices->device_list_mutex); btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; fs_devices->num_devices--; btrfs_assign_next_active_device(tgtdev, NULL); list_del_rcu(&tgtdev->dev_list); mutex_unlock(&fs_devices->device_list_mutex); btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); btrfs_free_device(tgtdev); } /* * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate * @path: the path to the device * * This will read the super block of the device at @path and populate @args with * the devid, fsid, and uuid. This is meant to be used for ioctls that need to * lookup a device to operate on, but need to do it before we take any locks. 
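 *
 * For illustration, the intended call sequence looks roughly like the one
 * in btrfs_find_device_by_devspec() below (error handling trimmed):
 *
 *   BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *   ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
 *   if (ret)
 *           return ERR_PTR(ret);
 *   device = btrfs_find_device(fs_info->fs_devices, &args);
 *   btrfs_put_dev_args_from_path(&args);   // frees args.uuid and args.fsid
 *   if (!device)
 *           return ERR_PTR(-ENOENT);
 *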
* This properly handles the special case of "missing" that a user may pass in, * and does some basic sanity checks. The caller must make sure that @path is * properly NUL terminated before calling in, and must call * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and * uuid buffers. * * Return: 0 for success, -errno for failure */ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path) { struct btrfs_super_block *disk_super; struct file *bdev_file; int ret; if (!path || !path[0]) return -EINVAL; if (!strcmp(path, "missing")) { args->missing = true; return 0; } args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); if (!args->uuid || !args->fsid) { btrfs_put_dev_args_from_path(args); return -ENOMEM; } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; } args->devid = btrfs_stack_device_id(&disk_super->dev_item); memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); fput(bdev_file); return 0; } /* * Only use this jointly with btrfs_get_dev_args_from_path() because we will * allocate our ->uuid and ->fsid pointers, everybody else uses local variables * that don't need to be freed. */ void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) { kfree(args->uuid); kfree(args->fsid); args->uuid = NULL; args->fsid = NULL; } struct btrfs_device *btrfs_find_device_by_devspec( struct btrfs_fs_info *fs_info, u64 devid, const char *device_path) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *device; int ret; if (devid) { args.devid = devid; device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) return ERR_PTR(-ENOENT); return device; } ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); if (ret) return ERR_PTR(ret); device = btrfs_find_device(fs_info->fs_devices, &args); btrfs_put_dev_args_from_path(&args); if (!device) return ERR_PTR(-ENOENT); return device; } static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) return ERR_PTR(-EINVAL); /* * Private copy of the seed devices, anchored at * fs_info->fs_devices->seed_list */ seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return seed_devices; /* * It's necessary to retain a copy of the original seed fs_devices in * fs_uuids so that filesystems which have been seeded can successfully * reference the seed device from open_seed_devices. This also supports * multiple fs seed. */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); return old_devices; } list_add(&old_devices->fs_list, &fs_uuids); memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); return seed_devices; } /* * Splice seed devices into the sprout fs_devices. * Generate a new fsid for the sprouted read-write filesystem. 
*/ static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *seed_devices) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_super_block *disk_super = fs_info->super_copy; struct btrfs_device *device; u64 super_flags; /* * We are updating the fsid, the thread leading to device_list_add() * could race, so uuid_mutex is needed. */ lockdep_assert_held(&uuid_mutex); /* * The threads listed below may traverse dev_list but can do that without * device_list_mutex: * - All device ops and balance - as we are in btrfs_exclop_start. * - Various dev_list readers - are using RCU. * - btrfs_ioctl_fitrim() - is using RCU. * * For-read threads as below are using device_list_mutex: * - Readonly scrub btrfs_scrub_dev() * - Readonly scrub btrfs_scrub_progress() * - btrfs_get_dev_stats() */ lockdep_assert_held(&fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); } /* * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dev_item *dev_item; struct btrfs_device *device; struct btrfs_key key; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; while (1) { btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; leaf = path->nodes[0]; next_slot: if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret > 0) break; if (ret < 0) goto error; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); continue; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || key.type != BTRFS_DEV_ITEM_KEY) break; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); args.uuid = dev_uuid; args.fsid = fs_uuid; device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); path->slots[0]++; goto next_slot; } ret = 0; error: btrfs_free_path(path); return ret; } int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct 
btrfs_device *device; struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; u64 orig_super_total_bytes; u64 orig_super_num_devices; int ret = 0; bool seeding_dev = false; bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); locked = true; } sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; } } rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } device->fs_info = fs_info; device->bdev_file = bdev_file; device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error_free_device; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto error_free_zone; } set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; device->total_bytes = round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { ret = PTR_ERR(seed_devices); btrfs_abort_transaction(trans, ret); goto error_trans; } } mutex_lock(&fs_devices->device_list_mutex); if (seeding_dev) { btrfs_setup_sprout(fs_info, seed_devices); btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, device); } device->fs_devices = fs_devices; mutex_lock(&fs_info->chunk_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->num_devices++; fs_devices->open_devices++; fs_devices->rw_devices++; fs_devices->total_devices++; fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!bdev_nonrot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, round_down(orig_super_total_bytes + device->total_bytes, fs_info->sectorsize)); orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); /* * we've got more storage, clear any full flags on the space * infos */ btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ btrfs_sysfs_add_device(device); 
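	/*
	 * Everything registered above (list membership, device counters,
	 * super block totals and the sysfs entry) is transient until the
	 * transaction commits; failures from here on unwind it via the
	 * error_sysfs label below.
	 */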
mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } } ret = btrfs_add_dev_item(trans, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } if (seeding_dev) { ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } /* * fs_devices now represents the newly sprouted filesystem and * its fsid has been changed by btrfs_sprout_splice(). */ btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); locked = false; if (ret) /* transaction commit */ return ret; ret = btrfs_relocate_sys_chunks(fs_info); if (ret < 0) btrfs_handle_fs_error(fs_info, ret, "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; ret = PTR_ERR(trans); trans = NULL; goto error_sysfs; } ret = btrfs_commit_transaction(trans); } /* * Now that we have written a new super block to this device, check all * other fs_devices list if device_path alienates any other scanned * device. * We can ignore the return value as it typically returns -EINVAL and * only succeeds if the device was an alien. */ btrfs_forget_devices(device->devt); /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); return ret; error_sysfs: btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); list_del(&device->dev_alloc_list); fs_info->fs_devices->num_devices--; fs_info->fs_devices->open_devices--; fs_info->fs_devices->rw_devices--; fs_info->fs_devices->total_devices--; fs_info->fs_devices->total_rw_bytes -= device->total_bytes; atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); btrfs_set_super_total_bytes(fs_info->super_copy, orig_super_total_bytes); btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: if (trans) btrfs_end_transaction(trans); error_free_zone: btrfs_destroy_dev_zone_info(device); error_free_device: btrfs_free_device(device); error: fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; struct btrfs_path *path; struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); btrfs_set_device_id(leaf, dev_item, device->devid); btrfs_set_device_type(leaf, dev_item, device->type); btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 
btrfs_set_device_total_bytes(leaf, dev_item, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); out: btrfs_free_path(path); return ret; } int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; new_size = round_down(new_size, fs_info->sectorsize); mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { mutex_unlock(&fs_info->chunk_mutex); return -EINVAL; } btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; atomic64_add(diff, &fs_info->free_chunk_space); btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->fs_info); if (list_empty(&device->post_commit_list)) list_add_tail(&device->post_commit_list, &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); ret = -EUCLEAN; goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); goto out; } out: btrfs_free_path(path); return ret; } static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; int ret = 0; u32 num_stripes; u32 array_size; u32 len = 0; u32 cur; struct btrfs_key key; lockdep_assert_held(&fs_info->chunk_mutex); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; cur = 0; while (cur < array_size) { disk_key = (struct btrfs_disk_key *)ptr; btrfs_disk_key_to_cpu(&key, disk_key); len = sizeof(*disk_key); if (key.type == BTRFS_CHUNK_ITEM_KEY) { chunk = (struct btrfs_chunk *)(ptr + len); num_stripes = btrfs_stack_chunk_num_stripes(chunk); len += btrfs_chunk_item_size(num_stripes); } else { ret = -EIO; break; } if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && key.offset == chunk_offset) { memmove(ptr, ptr + len, array_size - (cur + len)); array_size -= len; btrfs_set_super_sys_array_size(super_copy, array_size); } else { ptr += len; cur += len; } } return ret; } struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, 
u64 logical, u64 length) { struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev; struct btrfs_chunk_map *map; struct btrfs_chunk_map *prev_map = NULL; while (node) { map = rb_entry(node, struct btrfs_chunk_map, rb_node); prev = node; prev_map = map; if (logical < map->start) { node = node->rb_left; } else if (logical >= map->start + map->chunk_len) { node = node->rb_right; } else { refcount_inc(&map->refs); return map; } } if (!prev) return NULL; orig_prev = prev; while (prev && logical >= prev_map->start + prev_map->chunk_len) { prev = rb_next(prev); prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); } if (!prev) { prev = orig_prev; prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); while (prev && logical < prev_map->start) { prev = rb_prev(prev); prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); } } if (prev) { u64 end = logical + length; /* * Caller can pass a U64_MAX length when it wants to get any * chunk starting at an offset of 'logical' or higher, so deal * with underflow by resetting the end offset to U64_MAX. */ if (end < logical) end = U64_MAX; if (end > prev_map->start && logical < prev_map->start + prev_map->chunk_len) { refcount_inc(&prev_map->refs); return prev_map; } } return NULL; } struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_chunk_map *map; read_lock(&fs_info->mapping_tree_lock); map = btrfs_find_chunk_map_nolock(fs_info, logical, length); read_unlock(&fs_info->mapping_tree_lock); return map; } /* * Find the mapping containing the given logical extent. * * @logical: Logical block offset in bytes. * @length: Length of extent in bytes. * * Return: Chunk mapping or ERR_PTR. */ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_chunk_map *map; map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, map->start + map->chunk_len); btrfs_free_chunk_map(map); return ERR_PTR(-EINVAL); } /* Callers are responsible for dropping the reference. */ return map; } static int remove_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map, u64 chunk_offset) { int i; /* * Removing chunk items and updating the device items in the chunks btree * requires holding the chunk_mutex. * See the comment at btrfs_chunk_alloc() for the details. */ lockdep_assert_held(&trans->fs_info->chunk_mutex); for (i = 0; i < map->num_stripes; i++) { int ret; ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) return ret; } return btrfs_free_chunk(trans, chunk_offset); } int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_chunk_map *map; u64 dev_extent_len = 0; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(map)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. 
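 *
 * (For reference, btrfs_find_chunk_map_nolock() above is an interval
 * lookup in a tree ordered by chunk start: descend left when the logical
 * address is below a node's range, right when it is past the end, and
 * stop when the range contains it.  The same idea over a sorted array,
 * as a hedged userspace sketch with illustrative names:
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   struct demo_map { uint64_t start; uint64_t len; };    // sorted by start
 *
 *   static const struct demo_map *demo_lookup(const struct demo_map *maps,
 *                                             size_t n, uint64_t logical)
 *   {
 *           size_t lo = 0, hi = n;
 *
 *           while (lo < hi) {
 *                   size_t mid = lo + (hi - lo) / 2;
 *
 *                   if (logical < maps[mid].start)
 *                           hi = mid;                      // go left
 *                   else if (logical >= maps[mid].start + maps[mid].len)
 *                           lo = mid + 1;                  // go right
 *                   else
 *                           return &maps[mid];             // contained
 *           }
 *           return NULL;
 *   }
 *
 * The kernel version additionally falls back to a neighbouring entry that
 * merely intersects [logical, logical + length) and takes a refcount on
 * whatever it returns.)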
*/ ASSERT(0); return PTR_ERR(map); } /* * First delete the device extent items from the devices btree. * We take the device_list_mutex to avoid racing with the finishing phase * of a device replace operation. See the comment below before acquiring * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex * because that can result in a deadlock when deleting the device extent * items from the devices btree - COWing an extent buffer from the btree * may result in allocating a new metadata chunk, which would attempt to * lock again fs_info->chunk_mutex. */ mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; } if (device->bytes_used > 0) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } } mutex_unlock(&fs_devices->device_list_mutex); /* * We acquire fs_info->chunk_mutex for 2 reasons: * * 1) Just like with the first phase of the chunk allocation, we must * reserve system space, do all chunk btree updates and deletions, and * update the system chunk array in the superblock while holding this * mutex. This is for similar reasons as explained on the comment at * the top of btrfs_chunk_alloc(); * * 2) Prevent races with the final phase of a device replace operation * that replaces the device object associated with the map's stripes, * because the device object's id can change at any time during that * final phase of the device replace operation * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the * replaced device and then see it with an ID of * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating * the device item, which does not exists on the chunk btree. * The finishing phase of device replace acquires both the * device_list_mutex and the chunk_mutex, in that order, so we are * safe by just acquiring the chunk_mutex. */ trans->removing_chunk = true; mutex_lock(&fs_info->chunk_mutex); check_system_chunk(trans, map->type); ret = remove_chunk_item(trans, map, chunk_offset); /* * Normally we should not get -ENOSPC since we reserved space before * through the call to check_system_chunk(). * * Despite our system space_info having enough free space, we may not * be able to allocate extents from its block groups, because all have * an incompatible profile, which will force us to allocate a new system * block group with the right profile, or right after we called * check_system_space() above, a scrub turned the only system block group * with enough free space into RO mode. * This is explained with more detail at do_chunk_alloc(). * * So if we get -ENOSPC, allocate a new system chunk and retry once. 
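 *
 * In pattern form (demo names only, not btrfs code):
 *
 *   ret = do_update();
 *   if (ret == -ENOSPC) {
 *           ret = provision_more_space();  // e.g. allocate a system chunk
 *           if (!ret)
 *                   ret = do_update();     // retry exactly once
 *   }
 *   if (ret)
 *           abort_transaction();           // still failing: hard error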
*/ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = remove_chunk_item(trans, map, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } else if (ret) { btrfs_abort_transaction(trans, ret); goto out; } trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } mutex_unlock(&fs_info->chunk_mutex); trans->removing_chunk = false; /* * We are done with chunk btree updates and deletions, so release the * system space we previously reserved (with check_system_chunk()). */ btrfs_trans_release_chunk_metadata(trans); ret = btrfs_remove_block_group(trans, map); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } out: if (trans->removing_chunk) { mutex_unlock(&fs_info->chunk_mutex); trans->removing_chunk = false; } /* once for us */ btrfs_free_chunk_map(map); return ret; } int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_block_group *block_group; u64 length; int ret; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "relocate: not supported on extent tree v2 yet"); return -EINVAL; } /* * Prevent races with automatic removal of unused block groups. * After we relocate and before we remove the chunk with offset * chunk_offset, automatic removal of the block group can kick in, * resulting in a failure when calling btrfs_remove_chunk() below. * * Make sure to acquire this mutex before doing a tree search (dev * or chunk trees) to find chunks. Otherwise the cleaner kthread might * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ lockdep_assert_held(&fs_info->reclaim_bgs_lock); /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); btrfs_scrub_continue(fs_info); if (ret) { /* * If we had a transaction abort, stop all running scrubs. * See transaction.c:cleanup_transaction() why we do it here. */ if (BTRFS_FS_ERROR(fs_info)) btrfs_scrub_cancel(fs_info); return ret; } block_group = btrfs_lookup_block_group(fs_info, chunk_offset); if (!block_group) return -ENOENT; btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); length = block_group->length; btrfs_put_block_group(block_group); /* * On a zoned file system, discard the whole block group, this will * trigger a REQ_OP_ZONE_RESET operation on the device zone. If * resetting the zone fails, don't treat it as a fatal problem from the * filesystem's point of view. 
*/ if (btrfs_is_zoned(fs_info)) { ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); if (ret) btrfs_info(fs_info, "failed to reset zone %llu after relocation", chunk_offset); } trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } /* * step two, delete the device extents and the * chunk tree entries */ ret = btrfs_remove_chunk(trans, chunk_offset); btrfs_end_transaction(trans); return ret; } static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) { struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_chunk *chunk; struct btrfs_key key; struct btrfs_key found_key; u64 chunk_type; bool retried = false; int failed = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } if (ret == 0) { /* * On the first search we would find chunk tree with * offset -1, which is not possible. On subsequent * loops this would find an existing item on an invalid * offset (one less than the previous one, wrong * alignment and size). */ ret = -EUCLEAN; mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); if (ret) mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto error; if (ret > 0) break; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_relocate_chunk(fs_info, found_key.offset); if (ret == -ENOSPC) failed++; else BUG_ON(ret); } mutex_unlock(&fs_info->reclaim_bgs_lock); if (found_key.offset == 0) break; key.offset = found_key.offset - 1; } ret = 0; if (failed && !retried) { failed = 0; retried = true; goto again; } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } error: btrfs_free_path(path); return ret; } /* * return 1 : allocate a data chunk successfully, * return <0: errors during allocating a data chunk, * return 0 : no need to allocate a data chunk. 
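 *
 * A caller would typically consume that convention like this (hedged,
 * illustrative snippet, not lifted from the actual call site):
 *
 *   ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
 *   if (ret < 0)
 *           return ret;    // failed to force-allocate a data chunk
 *   // ret == 1: a data chunk was force-allocated; ret == 0: not needed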
*/ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_block_group *cache; u64 bytes_used; u64 chunk_type; cache = btrfs_lookup_block_group(fs_info, chunk_offset); ASSERT(cache); chunk_type = cache->flags; btrfs_put_block_group(cache); if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) return 0; spin_lock(&fs_info->data_sinfo->lock); bytes_used = fs_info->data_sinfo->bytes_used; spin_unlock(&fs_info->data_sinfo->lock); if (!bytes_used) { struct btrfs_trans_handle *trans; int ret; trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); btrfs_end_transaction(trans); if (ret < 0) return ret; return 1; } return 0; } static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, const struct btrfs_disk_balance_args *disk) { memset(cpu, 0, sizeof(*cpu)); cpu->profiles = le64_to_cpu(disk->profiles); cpu->usage = le64_to_cpu(disk->usage); cpu->devid = le64_to_cpu(disk->devid); cpu->pstart = le64_to_cpu(disk->pstart); cpu->pend = le64_to_cpu(disk->pend); cpu->vstart = le64_to_cpu(disk->vstart); cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); cpu->stripes_min = le32_to_cpu(disk->stripes_min); cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, const struct btrfs_balance_args *cpu) { memset(disk, 0, sizeof(*disk)); disk->profiles = cpu_to_le64(cpu->profiles); disk->usage = cpu_to_le64(cpu->usage); disk->devid = cpu_to_le64(cpu->devid); disk->pstart = cpu_to_le64(cpu->pstart); disk->pend = cpu_to_le64(cpu->pend); disk->vstart = cpu_to_le64(cpu->vstart); disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); disk->stripes_min = cpu_to_le32(cpu->stripes_min); disk->stripes_max = cpu_to_le32(cpu->stripes_max); } static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; int ret, err; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*item)); if (ret) goto out; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); btrfs_set_balance_data(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); btrfs_set_balance_flags(leaf, item, bctl->flags); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; return ret; } static int del_balance_item(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; struct 
btrfs_path *path; struct btrfs_key key; int ret, err; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_start_transaction_fallback_global_rsv(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; return ret; } /* * This is a heuristic used to reduce the number of chunks balanced on * resume after balance was interrupted. */ static void update_balance_args(struct btrfs_balance_control *bctl) { /* * Turn on soft mode for chunk types that were being converted. */ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; /* * Turn on usage filter if is not already used. The idea is * that chunks that we have already balanced should be * reasonably full. Don't do it for chunks that are being * converted - that will keep us from relocating unconverted * (albeit full) chunks. */ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; } } /* * Clear the balance status in fs_info and delete the balance item from disk. */ static void reset_balance_state(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; spin_unlock(&fs_info->balance_lock); kfree(bctl); ret = del_balance_item(fs_info); if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); } /* * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). 
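* The individual filters below are combined by should_balance_chunk(): a chunk is relocated only if every filter enabled in its btrfs_balance_args accepts it.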
*/ static int chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->profiles & chunk_type) return 0; return 1; } static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh_min = 0; else user_thresh_min = mult_perc(cache->length, bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) user_thresh_max = cache->length; else user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; btrfs_put_block_group(cache); return ret; } static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->length; else user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; btrfs_put_block_group(cache); return ret; } static int chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); int i; for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) return 0; } return 1; } static u64 calc_data_stripes(u64 type, int num_stripes) { const int index = btrfs_bg_flags_to_raid_index(type); const int ncopies = btrfs_raid_array[index].ncopies; const int nparity = btrfs_raid_array[index].nparity; return (num_stripes - nparity) / ncopies; } /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; type = btrfs_chunk_type(leaf, chunk); factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) continue; stripe_offset = btrfs_stripe_offset(leaf, stripe); stripe_length = btrfs_chunk_length(leaf, chunk); stripe_length = div_u64(stripe_length, factor); if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) return 0; } return 1; } /* [vstart, vend) */ static int chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset, struct btrfs_balance_args *bargs) { if (chunk_offset < bargs->vend && chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) /* at least part of the chunk is inside this vrange */ return 0; return 1; } static int chunk_stripes_range_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_balance_args *bargs) { int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); if (bargs->stripes_min <= num_stripes && num_stripes <= bargs->stripes_max) return 0; return 1; } static int 
chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return 0; chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->target == chunk_type) return 1; return 0; } static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { return 0; } if (chunk_type & BTRFS_BLOCK_GROUP_DATA) bargs = &bctl->data; else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) bargs = &bctl->sys; else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) bargs = &bctl->meta; /* profiles filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && chunk_profiles_filter(chunk_type, bargs)) { return 0; } /* usage filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(fs_info, chunk_offset, bargs)) { return 0; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { return 0; } /* devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && chunk_devid_filter(leaf, chunk, bargs)) { return 0; } /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && chunk_drange_filter(leaf, chunk, bargs)) { return 0; } /* vrange filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { return 0; } /* stripes filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && chunk_stripes_range_filter(leaf, chunk, bargs)) { return 0; } /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { return 0; } /* * limited by count, must be the last filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { if (bargs->limit == 0) return 0; else bargs->limit--; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. 
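* limit_min is instead enforced in __btrfs_balance() after its counting pass, while limit_max is decremented here for every chunk that passed the other filters. For example, limit=0..2 relocates at most two matching chunks of this type.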
*/ if (bargs->limit_max == 0) return 0; else bargs->limit_max--; } return 1; } static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; int slot; int ret; int enospc_errors = 0; bool counting = true; /* The single value limit and min/max limits use the same bytes in the */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; u32 count_data = 0; u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } /* zero out stat counters */ spin_lock(&fs_info->balance_lock); memset(&bctl->stat, 0, sizeof(bctl->stat)); spin_unlock(&fs_info->balance_lock); again: if (!counting) { /* * The single value limit and min/max limits use the same bytes * in the */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || atomic_read(&fs_info->balance_cancel_req)) { ret = -ECANCELED; goto error; } mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } /* * this shouldn't happen, it means the last relocate * failed */ if (ret == 0) BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { mutex_unlock(&fs_info->reclaim_bgs_lock); ret = 0; break; } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) { mutex_unlock(&fs_info->reclaim_bgs_lock); break; } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); bctl->stat.considered++; spin_unlock(&fs_info->balance_lock); } ret = should_balance_chunk(leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (counting) { mutex_unlock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); if (chunk_type & BTRFS_BLOCK_GROUP_DATA) count_data++; else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) count_sys++; else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) count_meta++; goto loop; } /* * Apply limit_min filter, no need to check if the LIMITS * filter is used, limit_min is 0 by default */ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && count_data < bctl->data.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && count_meta < bctl->meta.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && count_sys < bctl->sys.limit_min)) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (!chunk_reserved) { /* * We may be relocating the only data chunk we have, * which could potentially end up with losing data's * raid profile, so lets allocate an empty one in * advance. 
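* btrfs_may_alloc_data_chunk() only does that for a data chunk with no data bytes currently in use, see below.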
*/ ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } else if (ret == 1) { chunk_reserved = 1; } } ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; } else if (ret == -ETXTBSY) { btrfs_info(fs_info, "skipping relocation of block group %llu due to active swapfile", found_key.offset); ret = 0; } else if (ret) { goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; spin_unlock(&fs_info->balance_lock); } loop: if (found_key.offset == 0) break; key.offset = found_key.offset - 1; } if (counting) { btrfs_release_path(path); counting = false; goto again; } error: btrfs_free_path(path); if (enospc_errors) { btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); if (!ret) ret = -ENOSPC; } return ret; } /* * See if a given profile is valid and reduced. * * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : BTRFS_BLOCK_GROUP_PROFILE_MASK); flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; /* 1) check that all other bits are zeroed */ if (flags & ~mask) return 0; /* 2) see if profile is reduced */ if (flags == 0) return !extended; /* "0" is valid for usual profiles */ return has_single_bit_set(flags); } /* * Validate target profile against allowed profiles and return true if it's OK. * Otherwise print the error message and return false. */ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, const struct btrfs_balance_args *bargs, u64 allowed, const char *type) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) return true; btrfs_err(fs_info, "balance: invalid convert %s profile %s", type, btrfs_bg_type_to_raid_name(bargs->target)); return false; } /* * Fill @buf with textual description of balance filter flags @bargs, up to * @size_buf including the terminating null. The output may be trimmed if it * does not fit into the provided buffer. 
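* For example, a data filter with a (hypothetical) raid1 convert target and the soft flag is rendered as "convert=raid1,soft", which describe_balance_start_or_resume() below then prefixes with "-d".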
*/ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, u32 size_buf) { int ret; u32 size_bp = size_buf; char *bp = buf; u64 flags = bargs->flags; char tmp_buf[128] = {'\0'}; if (!flags) return; #define CHECK_APPEND_NOARG(a) \ do { \ ret = snprintf(bp, size_bp, (a)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) #define CHECK_APPEND_1ARG(a, v1) \ do { \ ret = snprintf(bp, size_bp, (a), (v1)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) #define CHECK_APPEND_2ARG(a, v1, v2) \ do { \ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) if (flags & BTRFS_BALANCE_ARGS_CONVERT) CHECK_APPEND_1ARG("convert=%s,", btrfs_bg_type_to_raid_name(bargs->target)); if (flags & BTRFS_BALANCE_ARGS_SOFT) CHECK_APPEND_NOARG("soft,"); if (flags & BTRFS_BALANCE_ARGS_PROFILES) { btrfs_describe_block_groups(bargs->profiles, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); } if (flags & BTRFS_BALANCE_ARGS_USAGE) CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) CHECK_APPEND_2ARG("usage=%u..%u,", bargs->usage_min, bargs->usage_max); if (flags & BTRFS_BALANCE_ARGS_DEVID) CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); if (flags & BTRFS_BALANCE_ARGS_DRANGE) CHECK_APPEND_2ARG("drange=%llu..%llu,", bargs->pstart, bargs->pend); if (flags & BTRFS_BALANCE_ARGS_VRANGE) CHECK_APPEND_2ARG("vrange=%llu..%llu,", bargs->vstart, bargs->vend); if (flags & BTRFS_BALANCE_ARGS_LIMIT) CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) CHECK_APPEND_2ARG("limit=%u..%u,", bargs->limit_min, bargs->limit_max); if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) CHECK_APPEND_2ARG("stripes=%u..%u,", bargs->stripes_min, bargs->stripes_max); #undef CHECK_APPEND_2ARG #undef CHECK_APPEND_1ARG #undef CHECK_APPEND_NOARG out_overflow: if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ else buf[0] = '\0'; } static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) { u32 size_buf = 1024; char tmp_buf[192] = {'\0'}; char *buf; char *bp; u32 size_bp = size_buf; int ret; struct btrfs_balance_control *bctl = fs_info->balance_ctl; buf = kzalloc(size_buf, GFP_KERNEL); if (!buf) return; bp = buf; #define CHECK_APPEND_1ARG(a, v1) \ do { \ ret = snprintf(bp, size_bp, (a), (v1)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) if (bctl->flags & BTRFS_BALANCE_FORCE) CHECK_APPEND_1ARG("%s", "-f "); if (bctl->flags & BTRFS_BALANCE_DATA) { describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("-d%s ", tmp_buf); } if (bctl->flags & BTRFS_BALANCE_METADATA) { describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("-m%s ", tmp_buf); } if (bctl->flags & BTRFS_BALANCE_SYSTEM) { describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("-s%s ", tmp_buf); } #undef CHECK_APPEND_1ARG out_overflow: if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ btrfs_info(fs_info, "balance: %s %s", (bctl->flags & BTRFS_BALANCE_RESUME) ? 
"resume" : "start", buf); kfree(buf); } /* * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; u64 num_devices; unsigned seq; bool reducing_redundancy; bool paused = false; int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || btrfs_should_cancel_balance(fs_info)) { ret = -EINVAL; goto out; } allowed = btrfs_super_incompat_flags(fs_info->super_copy); if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) mixed = 1; /* * In case of mixed groups both data and meta should be picked, * and identical options should be given for both of them. */ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; if (mixed && (bctl->flags & allowed)) { if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { btrfs_err(fs_info, "balance: mixed groups data and metadata options must be the same"); ret = -EINVAL; goto out; } } /* * rw_devices will not change at the moment, device add/delete/replace * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; /* * SINGLE profile on-disk has no profile bit, but in-memory we have a * special bit for it, to make it easier to distinguish. Thus we need * to set it manually, or balance would refuse the profile. */ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { ret = -EINVAL; goto out; } /* * Allow to reduce metadata or system integrity only if force set for * profiles with redundancy (copies, parity) */ allowed = 0; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { if (btrfs_raid_array[i].ncopies >= 2 || btrfs_raid_array[i].tolerated_failures >= 1) allowed |= btrfs_raid_array[i].bg_flag; } do { seq = read_seqbegin(&fs_info->profiles_lock); if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_system_alloc_bits & allowed) && !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) reducing_redundancy = true; else reducing_redundancy = false; /* if we're not converting, the target field is uninitialized */ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? bctl->meta.target : fs_info->avail_metadata_alloc_bits; data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); if (reducing_redundancy) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, "balance: force reducing metadata redundancy"); } else { btrfs_err(fs_info, "balance: reduces metadata redundancy, use --force if you want this"); ret = -EINVAL; goto out; } } if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", btrfs_bg_type_to_raid_name(meta_target), btrfs_bg_type_to_raid_name(data_target)); } ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { BUG_ON(ret == -EEXIST); BUG_ON(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = bctl; spin_unlock(&fs_info->balance_lock); } else { BUG_ON(ret != -EEXIST); spin_lock(&fs_info->balance_lock); update_balance_args(bctl); spin_unlock(&fs_info->balance_lock); } ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); paused = true; } /* * Balance can be canceled by: * * - Regular cancel request * Then ret == -ECANCELED and balance_cancel_req > 0 * * - Fatal signal to "btrfs" process * Either the signal caught by wait_reserve_ticket() and callers * got -EINTR, or caught by btrfs_should_cancel_balance() and * got -ECANCELED. * Either way, in this case balance_cancel_req = 0, and * ret == -EINTR or ret == -ECANCELED. * * So here we only check the return value to catch canceled balance. */ else if (ret == -ECANCELED || ret == -EINTR) btrfs_info(fs_info, "balance: canceled"); else btrfs_info(fs_info, "balance: ended with status: %d", ret); clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { memset(bargs, 0, sizeof(*bargs)); btrfs_update_ioctl_balance_args(fs_info, bargs); } /* We didn't pause, we can clean everything up. 
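* That is, delete the balance item and release the exclusive operation via reset_balance_state() and btrfs_exclop_finish() below.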
*/ if (!paused) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); return ret; out: if (bctl->flags & BTRFS_BALANCE_RESUME) reset_balance_state(fs_info); else kfree(bctl); btrfs_exclop_finish(fs_info); return ret; } static int balance_kthread(void *data) { struct btrfs_fs_info *fs_info = data; int ret = 0; sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); sb_end_write(fs_info->sb); return ret; } int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) { struct task_struct *tsk; mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return 0; } mutex_unlock(&fs_info->balance_mutex); if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "balance: resume skipped"); return 0; } spin_lock(&fs_info->super_lock); ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; spin_unlock(&fs_info->super_lock); /* * A ro->rw remount sequence should continue with the paused balance * regardless of who pauses it, system or the user as of now, so set * the resume flag. */ spin_lock(&fs_info->balance_lock); fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); return PTR_ERR_OR_ZERO(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { /* ret = -ENOENT; */ ret = 0; goto out; } bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { ret = -ENOMEM; goto out; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); bctl->flags = btrfs_balance_flags(leaf, item); bctl->flags |= BTRFS_BALANCE_RESUME; btrfs_balance_data(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); btrfs_balance_meta(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); /* * This should never happen, as the paused balance state is recovered * during mount without any chance of other exclusive ops to collide. * * This gives the exclusive op status to balance and keeps in paused * state until user intervention (cancel or umount). If the ownership * cannot be assigned, show a message but do not fail. The balance * is in a paused state and must have fs_info::balance_ctl properly * set up. 
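* A later resume or cancel request then operates on the in-memory balance_ctl set up below.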
*/ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); btrfs_release_path(path); mutex_lock(&fs_info->balance_mutex); BUG_ON(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = bctl; spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); out: btrfs_free_path(path); return ret; } int btrfs_pause_balance(struct btrfs_fs_info *fs_info) { int ret = 0; mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req); mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); /* we are good with balance_ctl ripped off from under us */ BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_pause_req); } else { ret = -ENOTCONN; } mutex_unlock(&fs_info->balance_mutex); return ret; } int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } /* * A paused balance with the item stored on disk can be resumed at * mount time if the mount is read-write. Otherwise it's still paused * and we must not allow cancelling as it deletes the item. */ if (sb_rdonly(fs_info->sb)) { mutex_unlock(&fs_info->balance_mutex); return -EROFS; } atomic_inc(&fs_info->balance_cancel_req); /* * if we are running just wait and return, balance item is * deleted in btrfs_balance in this case */ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); } else { mutex_unlock(&fs_info->balance_mutex); /* * Lock released to allow other waiters to continue, we'll * reexamine the status again. */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; } /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. 
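* Each such chunk is then relocated with btrfs_relocate_chunk().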
* The chunk relocation code actually frees the device extent */ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; u64 chunk_offset; int ret; int slot; int failed = 0; bool retried = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; u64 start; u64 free_diff = 0; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; diff = round_down(old_size - new_size, fs_info->sectorsize); if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return -EINVAL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_BACK; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; /* * The new free_chunk_space is new_size - used, so we have to * subtract the delta of the old free_chunk_space which included * old_size - used. If used > new_size then just subtract this * entire device's free space. */ if (device->bytes_used < new_size) free_diff = (old_size - device->bytes_used) - (new_size - device->bytes_used); else free_diff = old_size - device->bytes_used; atomic64_sub(free_diff, &fs_info->free_chunk_space); } /* * Once the device's size has been set to the new size, ensure all * in-memory chunks are synced to disk so that the loop below sees them * and relocates them accordingly. */ if (contains_pending_extent(device, &start, diff)) { mutex_unlock(&fs_info->chunk_mutex); ret = btrfs_commit_transaction(trans); if (ret) goto done; } else { mutex_unlock(&fs_info->chunk_mutex); btrfs_end_transaction(trans); } again: key.objectid = device->devid; key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; do { mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_previous_item(root, path, 0, key.type); if (ret) { mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto done; ret = 0; btrfs_release_path(path); break; } l = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); /* * We may be relocating the only data chunk we have, * which could potentially end up with losing data's * raid profile, so lets allocate an empty one in * advance. 
*/ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; } else if (ret) { if (ret == -ETXTBSY) { btrfs_warn(fs_info, "could not shrink block group %llu due to active swapfile", chunk_offset); } goto done; } } while (key.offset-- > 0); if (failed && !retried) { failed = 0; retried = true; goto again; } else if (failed && retried) { ret = -ENOSPC; goto done; } /* Shrinking succeeded, else we would be at "done". */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto done; } mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ clear_extent_bits(&device->alloc_state, new_size, (u64)-1, CHUNK_STATE_MASK); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) list_add_tail(&device->post_commit_list, &trans->transaction->dev_update_list); WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } else { ret = btrfs_commit_transaction(trans); } done: btrfs_free_path(path); if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes += diff; atomic64_add(free_diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); } return ret; } static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; lockdep_assert_held(&fs_info->chunk_mutex); array_size = btrfs_super_sys_array_size(super_copy); if (array_size + item_size + sizeof(disk_key) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; btrfs_cpu_key_to_disk(&disk_key, key); memcpy(ptr, &disk_key, sizeof(disk_key)); ptr += sizeof(disk_key); memcpy(ptr, chunk, item_size); item_size += sizeof(disk_key); btrfs_set_super_sys_array_size(super_copy, array_size + item_size); return 0; } /* * sort the devices in descending order by max_avail, total_avail */ static int btrfs_cmp_device_info(const void *a, const void *b) { const struct btrfs_device_info *di_a = a; const struct btrfs_device_info *di_b = b; if (di_a->max_avail > di_b->max_avail) return -1; if (di_a->max_avail < di_b->max_avail) return 1; if (di_a->total_avail > di_b->total_avail) return -1; if (di_a->total_avail < di_b->total_avail) return 1; return 0; } static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) return; btrfs_set_fs_incompat(info, RAID56); } static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) { if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) return; btrfs_set_fs_incompat(info, RAID1C34); } /* * Structure used internally for btrfs_create_chunk() function. * Wraps needed parameters. 
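* The fields are filled in by init_alloc_chunk_ctl(), gather_device_info() and decide_stripe_size() before create_chunk() builds the chunk map. For example, a RAID1 chunk over two writable devices ends up with num_stripes = 2, dev_stripes = 1, ncopies = 2 and nparity = 0.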
*/ struct alloc_chunk_ctl { u64 start; u64 type; /* Total number of stripes to allocate */ int num_stripes; /* sub_stripes info for map */ int sub_stripes; /* Stripes per device */ int dev_stripes; /* Maximum number of devices to use */ int devs_max; /* Minimum number of devices to use */ int devs_min; /* ndevs has to be a multiple of this */ int devs_increment; /* Number of copies */ int ncopies; /* Number of stripes worth of bytes to store parity information */ int nparity; u64 max_stripe_size; u64 max_chunk_size; u64 dev_extent_min; u64 stripe_size; u64 chunk_size; int ndevs; }; static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { struct btrfs_space_info *space_info; space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); } static void init_alloc_chunk_ctl_policy_zoned( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { u64 zone_size = fs_devices->fs_info->zone_size; u64 limit; int min_num_stripes = ctl->devs_min * ctl->dev_stripes; int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; u64 min_chunk_size = min_data_stripes * zone_size; u64 type = ctl->type; ctl->max_stripe_size = zone_size; if (type & BTRFS_BLOCK_GROUP_DATA) { ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, zone_size); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { ctl->max_chunk_size = ctl->max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { ctl->max_chunk_size = 2 * ctl->max_stripe_size; ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); } else { BUG(); } /* We don't want a chunk larger than 10% of writable space */ limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), zone_size), min_chunk_size); ctl->max_chunk_size = min(limit, ctl->max_chunk_size); ctl->dev_extent_min = zone_size * ctl->dev_stripes; } static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { int index = btrfs_bg_flags_to_raid_index(ctl->type); ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; ctl->devs_max = btrfs_raid_array[index].devs_max; if (!ctl->devs_max) ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); ctl->devs_min = btrfs_raid_array[index].devs_min; ctl->devs_increment = btrfs_raid_array[index].devs_increment; ctl->ncopies = btrfs_raid_array[index].ncopies; ctl->nparity = btrfs_raid_array[index].nparity; ctl->ndevs = 0; switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; case BTRFS_CHUNK_ALLOC_ZONED: init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); break; default: BUG(); } } static int gather_device_info(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = fs_devices->fs_info; struct btrfs_device *device; u64 total_avail; u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; int ret; int ndevs = 0; u64 max_avail; u64 dev_offset; /* * 
in the first pass through the devices list, we gather information * about the available holes on each device. */ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); continue; } if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (device->total_bytes > device->bytes_used) total_avail = device->total_bytes - device->bytes_used; else total_avail = 0; /* If there is no space on this device, skip it. */ if (total_avail < ctl->dev_extent_min) continue; ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) return ret; if (ret == 0) max_avail = dev_extent_want; if (max_avail < ctl->dev_extent_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, "%s: devid %llu has no free space, have=%llu want=%llu", __func__, device->devid, max_avail, ctl->dev_extent_min); continue; } if (ndevs == fs_devices->rw_devices) { WARN(1, "%s: found more than %llu devices\n", __func__, fs_devices->rw_devices); break; } devices_info[ndevs].dev_offset = dev_offset; devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; } ctl->ndevs = ndevs; /* * now sort the devices by hole size / available space */ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); return 0; } static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { /* Number of stripes that count for block group size */ int data_stripes; /* * The primary goal is to maximize the number of stripes, so use as * many devices as possible, even if the stripes are not maximum sized. * * The DUP profile stores more than one stripe per device, the * max_avail is the total size so we have to adjust. */ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; /* This will have to be fixed for RAID1 and RAID10 over more drives */ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* * Use the number of data stripes to figure out how big this chunk is * really going to be in terms of logical address space, and compare * that answer with the max chunk size. If it's higher, we try to * reduce stripe_size. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { /* * Reduce stripe_size, round it up to a 16MB boundary again and * then use it, unless it ends up being even bigger than the * previous value we had already. */ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, data_stripes), SZ_16M), ctl->stripe_size); } /* Stripe size should not go beyond 1G. 
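* For example (rough numbers, assuming the default 10 GiB data chunk size limit), a RAID0 data chunk over four devices with ~100 GiB free each would get 2.5 GiB stripes from the reduction above; the clamp below limits the stripes to 1 GiB, i.e. a 4 GiB chunk.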
*/ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); /* Align to BTRFS_STRIPE_LEN */ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); ctl->chunk_size = ctl->stripe_size * data_stripes; return 0; } static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { u64 zone_size = devices_info[0].dev->zone_info->zone_size; /* Number of stripes that count for block group size */ int data_stripes; /* * It should hold because: * dev_extent_min == dev_extent_want == zone_size * dev_stripes */ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); ctl->stripe_size = zone_size; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); } ctl->chunk_size = ctl->stripe_size * data_stripes; return 0; } static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = fs_devices->fs_info; /* * Round down to number of usable stripes, devs_increment can be any * number so we can't use round_down() that requires power of 2, while * rounddown is safe. */ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); if (ctl->ndevs < ctl->devs_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) { btrfs_debug(info, "%s: not enough devices with free space: have=%d minimum required=%d", __func__, ctl->ndevs, ctl->devs_min); } return -ENOSPC; } ctl->ndevs = min(ctl->ndevs, ctl->devs_max); switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); case BTRFS_CHUNK_ALLOC_ZONED: return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } } static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) { for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; set_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + map->stripe_size - 1, bits | EXTENT_NOWAIT, NULL); } } static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) { for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + map->stripe_size - 1, bits | EXTENT_NOWAIT, NULL, NULL); } } void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { write_lock(&fs_info->mapping_tree_lock); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); write_unlock(&fs_info->mapping_tree_lock); /* Once for the tree reference. 
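* Callers that looked the map up still hold and drop their own reference.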
*/ btrfs_free_chunk_map(map); } static int btrfs_chunk_map_cmp(const struct rb_node *new, const struct rb_node *exist) { const struct btrfs_chunk_map *new_map = rb_entry(new, struct btrfs_chunk_map, rb_node); const struct btrfs_chunk_map *exist_map = rb_entry(exist, struct btrfs_chunk_map, rb_node); if (new_map->start == exist_map->start) return 0; if (new_map->start < exist_map->start) return -1; return 1; } EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, btrfs_chunk_map_cmp); if (exist) { write_unlock(&fs_info->mapping_tree_lock); return -EEXIST; } chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); return 0; } EXPORT_FOR_TESTS struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) { struct btrfs_chunk_map *map; map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); if (!map) return NULL; refcount_set(&map->refs, 1); RB_CLEAR_NODE(&map->rb_node); return map; } static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_chunk_map *map; struct btrfs_block_group *block_group; u64 start = ctl->start; u64 type = ctl->type; int ret; map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) return ERR_PTR(-ENOMEM); map->start = start; map->chunk_len = ctl->chunk_size; map->stripe_size = ctl->stripe_size; map->type = type; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; for (int i = 0; i < ctl->ndevs; i++) { for (int j = 0; j < ctl->dev_stripes; j++) { int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + j * ctl->stripe_size; } } trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); ret = btrfs_add_chunk_map(info, map); if (ret) { btrfs_free_chunk_map(map); return ERR_PTR(ret); } block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); if (IS_ERR(block_group)) { btrfs_remove_chunk_map(info, map); return block_group; } for (int i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; btrfs_device_set_bytes_used(dev, dev->bytes_used + ctl->stripe_size); if (list_empty(&dev->post_commit_list)) list_add_tail(&dev->post_commit_list, &trans->transaction->dev_update_list); } atomic64_sub(ctl->stripe_size * map->num_stripes, &info->free_chunk_space); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); return block_group; } struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; struct btrfs_device_info *devices_info = NULL; struct alloc_chunk_ctl ctl; struct btrfs_block_group *block_group; int ret; lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { ASSERT(0); return ERR_PTR(-EINVAL); } if (list_empty(&fs_devices->alloc_list)) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, "%s: no writable device", __func__); return ERR_PTR(-ENOSPC); } if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", 
type); ASSERT(0); return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); ctl.type = type; init_alloc_chunk_ctl(fs_devices, &ctl); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return ERR_PTR(-ENOMEM); ret = gather_device_info(fs_devices, &ctl, devices_info); if (ret < 0) { block_group = ERR_PTR(ret); goto out; } ret = decide_stripe_size(fs_devices, &ctl, devices_info); if (ret < 0) { block_group = ERR_PTR(ret); goto out; } block_group = create_chunk(trans, &ctl, devices_info); out: kfree(devices_info); return block_group; } /* * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system * chunks. * * See the comment at btrfs_chunk_alloc() for details about the chunk allocation * phases. */ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_key key; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; struct btrfs_chunk_map *map; size_t item_size; int i; int ret; /* * We take the chunk_mutex for 2 reasons: * * 1) Updates and insertions in the chunk btree must be done while holding * the chunk_mutex, as well as updating the system chunk array in the * superblock. See the comment on top of btrfs_chunk_alloc() for the * details; * * 2) To prevent races with the final phase of a device replace operation * that replaces the device object associated with the map's stripes, * because the device object's id can change at any time during that * final phase of the device replace operation * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, * which would cause a failure when updating the device item, which does * not exists, or persisting a stripe of the chunk item with such ID. * Here we can't use the device_list_mutex because our caller already * has locked the chunk_mutex, and the final phase of device replace * acquires both mutexes - first the device_list_mutex and then the * chunk_mutex. Using any of those two mutexes protects us from a * concurrent device replace. 
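* The lockdep assertion below documents that requirement.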
*/ lockdep_assert_held(&fs_info->chunk_mutex); map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); if (IS_ERR(map)) { ret = PTR_ERR(map); btrfs_abort_transaction(trans, ret); return ret; } item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); if (!chunk) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_update_device(trans, device); if (ret) goto out; } stripe = &chunk->stripe; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } btrfs_set_stack_chunk_length(chunk, bg->length); btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.type = BTRFS_CHUNK_ITEM_KEY; key.offset = bg->start; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); if (ret) goto out; set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); if (ret) goto out; } out: kfree(chunk); btrfs_free_chunk_map(map); return ret; } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; struct btrfs_block_group *meta_bg; struct btrfs_block_group *sys_bg; /* * When adding a new device for sprouting, the seed device is read-only * so we must first allocate a metadata and a system chunk. But before * adding the block group items to the extent, device and chunk btrees, * we must first: * * 1) Create both chunks without doing any changes to the btrees, as * otherwise we would get -ENOSPC since the block groups from the * seed device are read-only; * * 2) Add the device item for the new sprout device - finishing the setup * of a new block group requires updating the device item in the chunk * btree, so it must exist when we attempt to do it. The previous step * ensures this does not fail with -ENOSPC. * * After that we can add the block group items to their btrees: * update existing device item in the chunk btree, add a new block group * item to the extent btree, add a new chunk item to the chunk btree and * finally add the new device extent items to the devices btree. 
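* That is why this helper only calls btrfs_create_chunk() twice; the chunk and block group items are inserted later, when the pending block groups are created.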
*/ alloc_profile = btrfs_metadata_alloc_profile(fs_info); meta_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); sys_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); return 0; } static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) { const int index = btrfs_bg_flags_to_raid_index(map->type); return btrfs_raid_array[index].tolerated_failures; } bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_chunk_map *map; int miss_ndevs = 0; int i; bool ret = true; map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(map)) return false; for (i = 0; i < map->num_stripes; i++) { if (test_bit(BTRFS_DEV_STATE_MISSING, &map->stripes[i].dev->dev_state)) { miss_ndevs++; continue; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &map->stripes[i].dev->dev_state)) { ret = false; goto end; } } /* * If the number of missing devices is larger than max errors, we can * not write the data into that chunk successfully. */ if (miss_ndevs > btrfs_chunk_max_errors(map)) ret = false; end: btrfs_free_chunk_map(map); return ret; } void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) { write_lock(&fs_info->mapping_tree_lock); while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { struct btrfs_chunk_map *map; struct rb_node *node; node = rb_first_cached(&fs_info->mapping_tree); map = rb_entry(node, struct btrfs_chunk_map, rb_node); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); /* Once for the tree ref. */ btrfs_free_chunk_map(map); cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } write_unlock(&fs_info->mapping_tree_lock); } static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) { enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); if (map->type & BTRFS_BLOCK_GROUP_RAID5) return 2; /* * There could be two corrupted data stripes, we need to loop retry in * order to rebuild the correct data. * * Fail a stripe at a time on every retry except the stripe under * reconstruction. */ if (map->type & BTRFS_BLOCK_GROUP_RAID6) return map->num_stripes; /* Non-RAID56, use their ncopies from btrfs_raid_array. */ return btrfs_raid_array[index].ncopies; } int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) /* * We could return errors for these cases, but that could get * ugly and we'd probably do the same thing which is just not do * anything else and exit, so return 1 so the callers don't try * to use other copies. 
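		 *
		 * As a purely illustrative example of why 1 is a safe fallback,
		 * a typical caller retries mirrors roughly like this
		 * (read_mirror() is a made-up helper, not a function in this
		 * file):
		 *
		 *   num_copies = btrfs_num_copies(fs_info, logical, len);
		 *   for (mirror = 1; mirror <= num_copies; mirror++)
		 *           if (read_mirror(logical, len, mirror) == 0)
		 *                   break;
		 *
		 * Returning 1 on a failed chunk lookup simply limits such a
		 * loop to a single attempt instead of propagating an error.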
*/ return 1; ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_chunk_map *map; unsigned long len = fs_info->sectorsize; if (!btrfs_fs_incompat(fs_info, RAID56)) return len; map = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); btrfs_free_chunk_map(map); } return len; } #ifdef CONFIG_BTRFS_EXPERIMENTAL static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { for (int index = first; index < first + num_stripes; index++) { const struct btrfs_device *device = map->stripes[index].dev; if (device->devid == READ_ONCE(device->fs_devices->read_devid)) return index; } /* If no read-preferred device is set use the first stripe. */ return first; } struct stripe_mirror { u64 devid; int num; }; static int btrfs_cmp_devid(const void *a, const void *b) { const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; if (s1->devid < s2->devid) return -1; if (s1->devid > s2->devid) return 1; return 0; } /* * Select a stripe for reading using the round-robin algorithm. * * 1. Compute the read cycle as the total sectors read divided by the minimum * sectors per device. * 2. Determine the stripe number for the current read by taking the modulus * of the read cycle with the total number of stripes: * * stripe index = (total sectors / min sectors per dev) % num stripes * * The calculated stripe index is then used to select the corresponding device * from the list of devices, which is ordered by devid. */ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) { struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; struct btrfs_device *device = map->stripes[first].dev; struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; unsigned int read_cycle; unsigned int total_reads; unsigned int min_reads_per_dev; total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> fs_info->sectorsize_bits; for (int index = 0, i = first; i < first + num_stripes; i++) { stripes[index].devid = map->stripes[i].dev->devid; stripes[index].num = i; index++; } sort(stripes, num_stripes, sizeof(struct stripe_mirror), btrfs_cmp_devid, NULL); read_cycle = total_reads / min_reads_per_dev; return stripes[read_cycle % num_stripes].num; } #endif static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; int tolerance; struct btrfs_device *srcdev; ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; else num_stripes = map->num_stripes; switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", policy); WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; #ifdef CONFIG_BTRFS_EXPERIMENTAL case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, 
num_stripes); break; case BTRFS_READ_POLICY_DEVID: preferred_mirror = btrfs_read_preferred(map, first, num_stripes); break; #endif } if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) srcdev = fs_info->dev_replace.srcdev; else srcdev = NULL; /* * try to avoid the drive that is the source drive for a * dev-replace procedure, only choose it if no other non-missing * mirror is available */ for (tolerance = 0; tolerance < 2; tolerance++) { if (map->stripes[preferred_mirror].dev->bdev && (tolerance || map->stripes[preferred_mirror].dev != srcdev)) return preferred_mirror; for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && (tolerance || map->stripes[i].dev != srcdev)) return i; } } /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ return preferred_mirror; } EXPORT_FOR_TESTS struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; bioc = kzalloc( /* The size of btrfs_io_context */ sizeof(struct btrfs_io_context) + /* Plus the variable array for the stripes */ sizeof(struct btrfs_io_stripe) * (total_stripes), GFP_NOFS); if (!bioc) return NULL; refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; bioc->replace_stripe_src = -1; bioc->full_stripe_logical = (u64)-1; bioc->logical = logical; return bioc; } void btrfs_get_bioc(struct btrfs_io_context *bioc) { WARN_ON(!refcount_read(&bioc->refs)); refcount_inc(&bioc->refs); } void btrfs_put_bioc(struct btrfs_io_context *bioc) { if (!bioc) return; if (refcount_dec_and_test(&bioc->refs)) kfree(bioc); } /* * Please note that, discard won't be sent to target device of device * replace. 
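 *
 * As a worked example of the stripe math below (illustrative numbers only,
 * assuming the usual 64K BTRFS_STRIPE_LEN): a 200K discard starting at
 * offset 100K into the chunk gives
 *
 *   stripe_nr         = 100K / 64K                 = 1
 *   stripe_offset     = 100K - 1 * 64K             = 36K
 *   stripe_nr_end     = round_up(300K, 64K) / 64K  = 5
 *   stripe_cnt        = 5 - 1                      = 4
 *   stripe_end_offset = 5 * 64K - 300K             = 20K
 *
 * i.e. the range touches 4 stripes, starting 36K into the first one and
 * ending 20K short of the end of the last one.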
*/ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes) { struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u32 stripe_nr; u32 stripe_nr_end; u32 stripe_cnt; u64 stripe_end_offset; u64 stripe_offset; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u32 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; int ret; int i; map = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(map)) return ERR_CAST(map); /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out_free_map; } offset = logical - map->start; length = min_t(u64, map->start + map->chunk_len - logical, length); *length_ret = length; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - (offset + length); /* * after this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { if (map->type & BTRFS_BLOCK_GROUP_RAID0) sub_stripes = 1; else sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_index = stripe_nr % factor; stripe_nr /= factor; stripe_index *= sub_stripes; remaining_stripes = stripe_cnt % factor; stripes_per_dev = stripe_cnt / factor; last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { *num_stripes = map->num_stripes; } else { stripe_index = stripe_nr % map->num_stripes; stripe_nr /= map->num_stripes; } stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); if (!stripes) { ret = -ENOMEM; goto out_free_map; } for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); if (i / sub_stripes < remaining_stripes) stripes[i].length += BTRFS_STRIPE_LEN; /* * Special for the first stripe and * the last stripe: * * |-------|...|-------| * |----------| * off end_off */ if (i < sub_stripes) stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { stripes[i].length = length; } stripe_index++; if (stripe_index == map->num_stripes) { stripe_index = 0; stripe_nr++; } } btrfs_free_chunk_map(map); return stripes; out_free_map: btrfs_free_chunk_map(map); return ERR_PTR(ret); } static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; bool ret; /* Non zoned filesystem does not use "to_copy" flag */ if (!btrfs_is_zoned(fs_info)) return false; cache = 
btrfs_lookup_block_group(fs_info, logical); ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; } static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, struct btrfs_io_geometry *io_geom) { u64 srcdev_devid = dev_replace->srcdev->devid; /* * At this stage, num_stripes is still the real number of stripes, * excluding the duplicated stripes. */ int num_stripes = io_geom->num_stripes; int max_errors = io_geom->max_errors; int nr_extra_stripes = 0; int i; /* * A block group which has "to_copy" set will eventually be copied by * the dev-replace process. We can avoid cloning IO here. */ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) return; /* * Duplicate the write operations while the dev-replace procedure is * running. Since the copying of the old disk to the new disk takes * place at run time while the filesystem is mounted writable, the * regular write operations to the old disk have to be duplicated to go * to the new disk as well. * * Note that device->missing is handled by the caller, and that the * write to the old disk is already set up in the stripes array. */ for (i = 0; i < num_stripes; i++) { struct btrfs_io_stripe *old = &bioc->stripes[i]; struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; if (old->dev->devid != srcdev_devid) continue; new->physical = old->physical; new->dev = dev_replace->tgtdev; if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) bioc->replace_stripe_src = i; nr_extra_stripes++; } /* We can only have at most 2 extra nr_stripes (for DUP). */ ASSERT(nr_extra_stripes <= 2); /* * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for * replace. * If we have 2 extra stripes, only choose the one with smaller physical. */ if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; /* Only DUP can have two extra stripes. */ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); /* * Swap the last stripe stripes and reduce @nr_extra_stripes. * The extra stripe would still be there, but won't be accessed. */ if (first->physical > second->physical) { swap(second->physical, first->physical); swap(second->dev, first->dev); nr_extra_stripes--; } } io_geom->num_stripes = num_stripes + nr_extra_stripes; io_geom->max_errors = max_errors + nr_extra_stripes; bioc->replace_nr_stripes = nr_extra_stripes; } static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, struct btrfs_io_geometry *io_geom) { /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; ASSERT(io_geom->stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * For full stripe start, we use previously calculated * @stripe_nr. Align it to nr_data_stripes, then multiply with * STRIPE_LEN. * * By this we can avoid u64 division completely. And we have * to go rounddown(), not round_down(), as nr_data_stripes is * not ensured to be power of 2. 
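		 *
		 * For example (illustrative numbers, assuming a 64K stripe
		 * length and RAID5 over 4 devices, i.e. 3 data stripes): an
		 * offset of 1100K gives stripe_nr = 1100K / 64K = 17,
		 * rounddown(17, 3) = 15, so the full stripe starts at
		 * 15 * 64K = 960K and covers 3 * 64K = 192K. round_down()
		 * would give a wrong result here because 3 is not a power of
		 * two.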
*/ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( rounddown(io_geom->stripe_nr, nr_data_stripes(map))); ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); ASSERT(io_geom->raid56_full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. */ if (io_geom->op == BTRFS_MAP_WRITE) return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); } /* * For other RAID types and for RAID56 reads, allow a single stripe (on * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) return BTRFS_STRIPE_LEN - io_geom->stripe_offset; return U64_MAX; } static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, u64 *length, struct btrfs_io_stripe *dst, struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { dst->dev = map->stripes[io_geom->stripe_index].dev; if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); dst->physical = map->stripes[io_geom->stripe_index].physical + io_geom->stripe_offset + btrfs_stripe_nr_to_offset(io_geom->stripe_nr); return 0; } static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, struct btrfs_io_geometry *io_geom) { if (!smap) return false; if (num_alloc_stripes != 1) return false; if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; } static void map_blocks_raid0(const struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; io_geom->stripe_nr /= map->num_stripes; if (io_geom->op == BTRFS_MAP_READ) io_geom->mirror_num = 1; } static void map_blocks_raid1(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom, bool dev_replace_is_ongoing) { if (io_geom->op != BTRFS_MAP_READ) { io_geom->num_stripes = map->num_stripes; return; } if (io_geom->mirror_num) { io_geom->stripe_index = io_geom->mirror_num - 1; return; } io_geom->stripe_index = find_live_mirror(fs_info, map, 0, dev_replace_is_ongoing); io_geom->mirror_num = io_geom->stripe_index + 1; } static void map_blocks_dup(const struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { if (io_geom->op != BTRFS_MAP_READ) { io_geom->num_stripes = map->num_stripes; return; } if (io_geom->mirror_num) { io_geom->stripe_index = io_geom->mirror_num - 1; return; } io_geom->mirror_num = 1; } static void map_blocks_raid10(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom, bool dev_replace_is_ongoing) { u32 factor = map->num_stripes / map->sub_stripes; int old_stripe_index; io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; io_geom->stripe_nr /= factor; if (io_geom->op != BTRFS_MAP_READ) { io_geom->num_stripes = map->sub_stripes; return; } if (io_geom->mirror_num) { io_geom->stripe_index += io_geom->mirror_num - 1; return; } old_stripe_index = io_geom->stripe_index; io_geom->stripe_index = find_live_mirror(fs_info, map, io_geom->stripe_index, dev_replace_is_ongoing); io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; } static void map_blocks_raid56_write(struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom, u64 logical, u64 *length) { int data_stripes = nr_data_stripes(map); /* * 
Needs full stripe mapping. * * Push stripe_nr back to the start of the full stripe For those cases * needing a full stripe, @stripe_nr is the full stripe number. * * Originally we go raid56_full_stripe_start / full_stripe_len, but * that can be expensive. Here we just divide @stripe_nr with * @data_stripes. */ io_geom->stripe_nr /= data_stripes; /* RAID[56] write or recovery. Return all stripes */ io_geom->num_stripes = map->num_stripes; io_geom->max_errors = btrfs_chunk_max_errors(map); /* Return the length to the full stripe end. */ *length = min(logical + *length, io_geom->raid56_full_stripe_start + map->start + btrfs_stripe_nr_to_offset(data_stripes)) - logical; io_geom->stripe_index = 0; io_geom->stripe_offset = 0; } static void map_blocks_raid56_read(struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { int data_stripes = nr_data_stripes(map); ASSERT(io_geom->mirror_num <= 1); /* Just grab the data stripe directly. */ io_geom->stripe_index = io_geom->stripe_nr % data_stripes; io_geom->stripe_nr /= data_stripes; /* We distribute the parity blocks across stripes. */ io_geom->stripe_index = (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) io_geom->mirror_num = 1; } static void map_blocks_single(const struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; io_geom->stripe_nr /= map->num_stripes; io_geom->mirror_num = io_geom->stripe_index + 1; } /* * Map one logical range to one or more physical ranges. * * @length: (Mandatory) mapped length of this run. * One logical range can be split into different segments * due to factors like zones and RAID0/5/6/10 stripe * boundaries. * * @bioc_ret: (Mandatory) returned btrfs_io_context structure. * which has one or more physical ranges (btrfs_io_stripe) * recorded inside. * Caller should call btrfs_put_bioc() to free it after use. * * @smap: (Optional) single physical range optimization. * If the map request can be fulfilled by one single * physical range, and this is parameter is not NULL, * then @bioc_ret would be NULL, and @smap would be * updated. * * @mirror_num_ret: (Mandatory) returned mirror number if the original * value is 0. * * Mirror number 0 means to choose any live mirrors. * * For non-RAID56 profiles, non-zero mirror_num means * the Nth mirror. (e.g. mirror_num 1 means the first * copy). * * For RAID56 profile, mirror 1 means rebuild from P and * the remaining data stripes. * * For RAID6 profile, mirror > 2 means mark another * data/P stripe error and rebuild from the remaining * stripes.. */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret) { struct btrfs_chunk_map *map; struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; int ret = 0; int num_copies; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; u16 num_alloc_stripes; u64 max_len; ASSERT(bioc_ret); io_geom.mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); io_geom.num_stripes = 1; io_geom.stripe_index = 0; io_geom.op = op; map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); num_copies = btrfs_chunk_map_num_copies(map); if (io_geom.mirror_num > num_copies) return -EINVAL; map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); if (dev_replace->replace_task != current) down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case BTRFS_BLOCK_GROUP_RAID0: map_blocks_raid0(map, &io_geom); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); break; case BTRFS_BLOCK_GROUP_DUP: map_blocks_dup(map, &io_geom); break; case BTRFS_BLOCK_GROUP_RAID10: map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) map_blocks_raid56_write(map, &io_geom, logical, length); else map_blocks_raid56_read(map, &io_geom); break; default: /* * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ map_blocks_single(map, &io_geom); break; } if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", io_geom.stripe_index, map->num_stripes); ret = -EINVAL; goto out; } num_alloc_stripes = io_geom.num_stripes; if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) /* * For replace case, we need to add extra stripes for extra * duplicated stripes. * * For both WRITE and GET_READ_MIRRORS, we may have at most * 2 more stripes (DUP types, otherwise 1). */ num_alloc_stripes += 2; /* * If this I/O maps to a single device, try to return the device and * physical block information on the stack instead of allocating an * I/O context structure. */ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; goto out; } bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; } bioc->map_type = map->type; bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the * rule that data stripes are all ordered, then followed with P and Q * (if we have). * * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo * with num_stripes). * * In this case, we just add @stripe_nr with @i, then do the * modulo, to reduce one modulo call. 
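		 *
		 * As an illustrative example: for RAID5 over 3 devices
		 * (num_stripes = 3) and stripe_nr = 4, the rotation is
		 * 4 % 3 = 1, so the loop below fills
		 *
		 *   stripes[0] from map->stripes[(0 + 4) % 3] = map->stripes[1]
		 *   stripes[1] from map->stripes[(1 + 4) % 3] = map->stripes[2]
		 *   stripes[2] from map->stripes[(2 + 4) % 3] = map->stripes[0]
		 *
		 * which keeps the data stripes in order, followed by P (and Q),
		 * just rotated by the full stripe number.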
*/ bioc->full_stripe_logical = map->start + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * nr_data_stripes(map)); for (int i = 0; i < io_geom.num_stripes; i++) { struct btrfs_io_stripe *dst = &bioc->stripes[i]; u32 stripe_index; stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + io_geom.stripe_offset + btrfs_stripe_nr_to_offset(io_geom.stripe_nr); } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ for (int i = 0; i < io_geom.num_stripes; i++) { ret = set_io_stripe(fs_info, logical, length, &bioc->stripes[i], map, &io_geom); if (ret < 0) break; io_geom.stripe_index++; } } if (ret) { *bioc_ret = NULL; btrfs_put_bioc(bioc); goto out; } if (op != BTRFS_MAP_READ) io_geom.max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); } *bioc_ret = bioc; bioc->num_stripes = io_geom.num_stripes; bioc->max_errors = io_geom.max_errors; bioc->mirror_num = io_geom.mirror_num; out: if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); } btrfs_free_chunk_map(map); return ret; } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { if (args->fsid == NULL) return true; if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) return true; return false; } static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { if (args->missing) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && !device->bdev) return true; return false; } if (device->devid != args->devid) return false; if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) return false; return true; } /* * Find a device specified by @devid or @uuid in the list of @fs_devices, or * return NULL. * * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. */ struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, const struct btrfs_dev_lookup_args *args) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; if (dev_args_match_fs_devices(args, fs_devices)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { if (dev_args_match_device(args, device)) return device; } } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!dev_args_match_fs_devices(args, seed_devs)) continue; list_for_each_entry(device, &seed_devs->devices, dev_list) { if (dev_args_match_device(args, device)) return device; } } return NULL; } static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; unsigned int nofs_flag; /* * We call this under the chunk_mutex, so we want to use NOFS for this * allocation, however we don't want to change btrfs_alloc_device() to * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. 
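	/*
	 * The scoped NOFS pattern used below is the standard one: between
	 * memalloc_nofs_save() and memalloc_nofs_restore(), GFP_KERNEL
	 * allocations behave as if GFP_NOFS had been passed, so
	 * btrfs_alloc_device() can keep using GFP_KERNEL internally without
	 * reclaim re-entering the filesystem while we hold the chunk_mutex.
	 */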
	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/*
 * Allocate new device struct, set up devid and UUID.
 *
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device. If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device. If NULL a new UUID
 *		is generated.
 * @path:	a pointer to device path if available, NULL otherwise.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error. Returned struct is not linked onto any lists and must be
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid, const u8 *uuid,
					const char *path)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	if (path) {
		struct rcu_string *name;

		name = rcu_string_strdup(path, GFP_KERNEL);
		if (!name) {
			btrfs_free_device(dev);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(dev->name, name);
	}

	return dev;
}

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			     devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
	const int data_stripes = calc_data_stripes(map->type, map->num_stripes);

	return div_u64(map->chunk_len, data_stripes);
}

#if BITS_PER_LONG == 32
/*
 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
 * can't be accessed on 32bit systems.
 *
 * This function does a mount time check to reject the fs if it already has
 * metadata chunk beyond that limit.
 */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return 0;

	if (logical + length < MAX_LFS_FILESIZE)
		return 0;

	btrfs_err_32bit_limit(fs_info);
	return -EOVERFLOW;
}

/*
 * This is to give early warning for any metadata chunk reaching
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
 * Although we can still access the metadata, it's not going to be possible
 * once the limit is reached.
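 *
 * As a rough illustration of the numbers involved (assuming 4K pages):
 * MAX_LFS_FILESIZE on 32-bit is ULONG_MAX pages, i.e. roughly 16T of logical
 * address space, and the early warning threshold sits below that limit.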
*/ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, u64 logical, u64 length, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_METADATA)) return; if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) return; btrfs_warn_32bit_limit(fs_info); } #endif static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid) { struct btrfs_device *dev; if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, uuid, true); return ERR_PTR(-ENOENT); } dev = add_missing_dev(fs_info->fs_devices, devid, uuid); if (IS_ERR(dev)) { btrfs_err(fs_info, "failed to init missing device %llu: %ld", devid, PTR_ERR(dev)); return dev; } btrfs_report_missing_device(fs_info, devid, uuid, false); return dev; } static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_chunk_map *map; u64 logical; u64 length; u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; int index; int num_stripes; int ret; int i; logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 ret = check_32bit_meta_chunk(fs_info, logical, length, type); if (ret < 0) return ret; warn_32bit_meta_chunk(fs_info, logical, length, type); #endif map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ if (map && map->start <= logical && map->start + map->chunk_len > logical) { btrfs_free_chunk_map(map); return 0; } else if (map) { btrfs_free_chunk_map(map); } map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); if (!map) return -ENOMEM; map->start = logical; map->chunk_len = length; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); map->type = type; /* * We can't use the sub_stripes value, as for profiles other than * RAID10, they may have 0 as sub_stripes for filesystems created by * older mkfs (<v5.4). * In that case, it can cause divide-by-zero errors later. * Since currently sub_stripes is fixed for each profile, let's * use the trusted value instead. 
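	 *
	 * Concretely, btrfs_raid_array[] has sub_stripes = 2 for RAID10 and 1
	 * for every other profile, so taking the value from there avoids a
	 * possible division by zero in later code that computes
	 * num_stripes / sub_stripes.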
*/ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; map->stripe_size = btrfs_calc_stripe_length(map); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); devid = btrfs_stripe_devid_nr(leaf, chunk, i); args.devid = devid; read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); args.uuid = uuid; map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); if (!map->stripes[i].dev) { map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { ret = PTR_ERR(map->stripes[i].dev); btrfs_free_chunk_map(map); return ret; } } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &(map->stripes[i].dev->dev_state)); } ret = btrfs_add_chunk_map(fs_info, map); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); } return ret; } static void fill_device_from_item(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item, struct btrfs_device *device) { unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); device->total_bytes = device->disk_total_bytes; device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->commit_bytes_used = device->bytes_used; device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); } static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, u8 *fsid) { struct btrfs_fs_devices *fs_devices; int ret; lockdep_assert_held(&uuid_mutex); ASSERT(fsid); /* This will match only for multi-device seed fs */ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) return fs_devices; fs_devices->seeding = true; fs_devices->opened = 1; return fs_devices; } /* * Upon first call for a seed fs fsid, just create a private copy of the * respective fs_devices and anchor it at fs_info->fs_devices->seed_list */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); return ERR_PTR(-EINVAL); } list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); return fs_devices; } static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; int ret; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; devid = btrfs_device_id(leaf, dev_item); args.devid = devid; read_extent_buffer(leaf, dev_uuid, 
btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); args.uuid = dev_uuid; args.fsid = fs_uuid; if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); } device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, dev_uuid, true); return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); if (IS_ERR(device)) { btrfs_err(fs_info, "failed to add missing dev %llu: %ld", devid, PTR_ERR(device)); return PTR_ERR(device); } btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, dev_uuid, true); return -ENOENT; } btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } if (!device->bdev && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { /* * this happens when a device that was properly setup * in the device info lists suddenly goes bad. * device->bdev is NULL, and so we have to set * device->missing to one here */ device->fs_devices->missing_devices++; set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } /* Move the device to its own fs_devices */ if (device->fs_devices != fs_devices) { ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)); list_move(&device->dev_list, &fs_devices->devices); device->fs_devices->num_devices--; fs_devices->num_devices++; device->fs_devices->missing_devices--; fs_devices->missing_devices++; device->fs_devices = fs_devices; } } if (device->fs_devices != fs_info->fs_devices) { BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); if (device->generation != btrfs_device_generation(leaf, dev_item)) return -EINVAL; } fill_device_from_item(leaf, dev_item, device); if (device->bdev) { u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, "device total_bytes should be at most %llu but found %llu", max_total_bytes, device->total_bytes); return -EINVAL; } } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { device->fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); } ret = 0; return ret; } int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; u32 array_size; u32 cur_offset; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); /* * We allocated a dummy extent, just to use extent buffer accessors. * There will be unused space after BTRFS_SUPER_INFO_SIZE, but * that's fine, we will not go beyond system chunk array anyway. 
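	 *
	 * For scale (stated only as an illustration): BTRFS_SUPER_INFO_SIZE is
	 * 4K and the smallest supported nodesize is also 4K, so the ASSERT
	 * above always holds, and the sys_chunk_array itself is capped at
	 * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE (2K), well within the copied
	 * superblock.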
*/ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; set_extent_buffer_uptodate(sb); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); array_ptr = super_copy->sys_chunk_array; sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); cur_offset = 0; while (cur_offset < array_size) { struct btrfs_chunk *chunk; struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; u32 len = sizeof(*disk_key); /* * The sys_chunk_array has been already verified at super block * read time. Only do ASSERT()s for basic checks. */ ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); array_ptr += len; sb_array_offset += len; cur_offset += len; ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) break; array_ptr += len; sb_array_offset += len; cur_offset += len; } clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; } /* * Check if all chunks in the fs are OK for read-write degraded mount * * If the @failing_dev is specified, it's accounted as missing. * * Return true if all chunks meet the minimal RW mount requirements. * Return false if any chunk doesn't meet the minimal RW mount requirements. */ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { struct btrfs_chunk_map *map; u64 next_start; bool ret = true; map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? Return false anyway */ if (!map) { ret = false; goto out; } while (map) { int missing = 0; int max_tolerated; int i; max_tolerated = btrfs_get_num_tolerated_disk_barrier_failures( map->type); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || dev->last_flush_error) missing++; else if (failing_dev && failing_dev == dev) missing++; } if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", map->start, missing, max_tolerated); btrfs_free_chunk_map(map); ret = false; goto out; } next_start = map->start + map->chunk_len; btrfs_free_chunk_map(map); map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } out: return ret; } static void readahead_tree_node_children(struct extent_buffer *node) { int i; const int nr_items = btrfs_header_nritems(node); for (i = 0; i < nr_items; i++) btrfs_readahead_node_child(node, i); } int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; int ret; int slot; int iter_ret = 0; u64 total_dev = 0; u64 last_ra_node = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * uuid_mutex is needed only if we are mounting a sprout FS * otherwise we don't need it. */ mutex_lock(&uuid_mutex); /* * It is possible for mount and umount to race in such a way that * we execute this code path, but open_fs_devices failed to clear * total_rw_bytes. We certainly want it cleared before reading the * device items, so clear it here. 
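	 *
	 * (read_one_dev() re-accumulates total_rw_bytes from each writeable
	 * device's total_bytes as the device items are read, and the final
	 * value is sanity checked against super_total_bytes once the whole
	 * tree has been walked.)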
*/ fs_info->fs_devices->total_rw_bytes = 0; /* * Lockdep complains about possible circular locking dependency between * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores * used for freeze procection of a fs (struct super_block.s_writers), * which we take when starting a transaction, and extent buffers of the * chunk tree if we call read_one_dev() while holding a lock on an * extent buffer of the chunk tree. Since we are mounting the filesystem * and at this point there can't be any concurrent task modifying the * chunk tree, to keep it simple, just skip locking on the chunk tree. */ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); path->skip_locking = 1; /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; leaf = path->nodes[0]; slot = path->slots[0]; if (node) { if (last_ra_node != node->start) { readahead_tree_node_children(node); last_ra_node = node->start; } } if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); ret = read_one_dev(leaf, dev_item); if (ret) goto error; total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; /* * We are only called at mount time, so no need to take * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, * we always lock first fs_info->chunk_mutex before * acquiring any locks on the chunk tree. This is a * requirement for chunk allocation, see the comment on * top of btrfs_chunk_alloc() for details. */ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = read_one_chunk(&found_key, leaf, chunk); if (ret) goto error; } } /* Catch error found during iteration */ if (iter_ret < 0) { ret = iter_ret; goto error; } /* * After loading chunk tree, we've got all device information, * do another round of validation checks. 
*/ if (total_dev != fs_info->fs_devices->total_devices) { btrfs_warn(fs_info, "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); fs_info->fs_devices->total_devices = total_dev; btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { btrfs_err(fs_info, "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", btrfs_super_total_bytes(fs_info->super_copy), fs_info->fs_devices->total_rw_bytes); ret = -EINVAL; goto error; } ret = 0; error: mutex_unlock(&uuid_mutex); btrfs_free_path(path); return ret; } int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; int ret = 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed_devs->devices, dev_list) { device->fs_info = fs_info; ret = btrfs_get_dev_zone_info(device, false); if (ret) break; } seed_devs->fs_info = fs_info; } mutex_unlock(&fs_devices->device_list_mutex); return ret; } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, const struct btrfs_dev_stats_item *ptr, int index) { u64 val; read_extent_buffer(eb, &val, offsetof(struct btrfs_dev_stats_item, values) + ((unsigned long)ptr) + (index * sizeof(u64)), sizeof(val)); return val; } static void btrfs_set_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, int index, u64 val) { write_extent_buffer(eb, &val, offsetof(struct btrfs_dev_stats_item, values) + ((unsigned long)ptr) + (index * sizeof(u64)), sizeof(val)); } static int btrfs_device_init_dev_stats(struct btrfs_device *device, struct btrfs_path *path) { struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; struct btrfs_key key; int item_size; int i, ret, slot; if (!device->fs_info->dev_root) return 0; key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); if (ret) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); return ret < 0 ? 
ret : 0; } slot = path->slots[0]; eb = path->nodes[0]; item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (item_size >= (1 + i) * sizeof(__le64)) btrfs_dev_stat_set(device, i, btrfs_dev_stats_value(eb, ptr, i)); else btrfs_dev_stat_set(device, i, 0); } device->dev_stats_valid = 1; btrfs_dev_stat_print_on_load(device); btrfs_release_path(path); return 0; } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_device_init_dev_stats(device, path); if (ret) goto out; } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed_devs->devices, dev_list) { ret = btrfs_device_init_dev_stats(device, path); if (ret) goto out; } } out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_stats_item *ptr; int ret; int i; key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); goto out; } if (ret == 0 && btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; } ret = 1; } if (ret == 1) { /* need to insert a new item */ btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; } } eb = path->nodes[0]; ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); out: btrfs_free_path(path); return ret; } /* * called from commit_transaction. Writes all changed device stats to disk. */ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int stats_cnt; int ret = 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { stats_cnt = atomic_read(&device->dev_stats_ccnt); if (!device->dev_stats_valid || stats_cnt == 0) continue; /* * There is a LOAD-LOAD control dependency between the value of * dev_stats_ccnt and updating the on-disk values which requires * reading the in-memory counters. Such control dependencies * require explicit read memory barriers. 
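		 *
		 * Schematically (an illustration, not the exact code), the
		 * pairing looks like:
		 *
		 *   update side (btrfs_dev_stat_inc)    this side
		 *     update dev_stat_values[i]           stats_cnt = atomic_read(ccnt)
		 *     smp_mb__before_atomic()             smp_rmb()
		 *     atomic_inc(dev_stats_ccnt)          read the stat values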
* * This memory barriers pairs with smp_mb__before_atomic in * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full * barrier implied by atomic_xchg in * btrfs_dev_stats_read_and_reset */ smp_rmb(); ret = update_dev_stat_item(trans, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); return ret; } void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) { int i; for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) if (btrfs_dev_stat_read(dev, i) != 0) break; if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int i; mutex_lock(&fs_devices->device_list_mutex); args.devid = stats->devid; dev = btrfs_find_device(fs_info->fs_devices, &args); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { btrfs_warn(fs_info, "get dev_stats failed, device not found"); return -ENODEV; } else if (!dev->dev_stats_valid) { btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); return -ENODEV; } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (stats->nr_items > i) stats->values[i] = btrfs_dev_stat_read_and_reset(dev, i); else btrfs_dev_stat_set(dev, i, 0); } btrfs_info(fs_info, "device stats zeroed by %s (%d)", current->comm, task_pid_nr(current)); } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) if (stats->nr_items > i) stats->values[i] = btrfs_dev_stat_read(dev, i); } if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; return 0; } /* * Update the size and bytes used for each device where it changed. This is * delayed since we would otherwise get errors while writing out the * superblocks. * * Must be invoked during transaction commit. */ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { struct btrfs_device *curr, *next; ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); if (list_empty(&trans->dev_update_list)) return; /* * We don't need the device_list_mutex here. This list is owned by the * transaction and the transaction must complete before the device is * released. 
*/ mutex_lock(&trans->fs_info->chunk_mutex); list_for_each_entry_safe(curr, next, &trans->dev_update_list, post_commit_list) { list_del_init(&curr->post_commit_list); curr->commit_total_bytes = curr->disk_total_bytes; curr->commit_bytes_used = curr->bytes_used; } mutex_unlock(&trans->fs_info->chunk_mutex); } /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ int btrfs_bg_type_to_factor(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); return btrfs_raid_array[index].ncopies; } static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct btrfs_chunk_map *map; struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; int i; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); if (!map) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); ret = -EUCLEAN; goto out; } stripe_len = btrfs_calc_stripe_length(map); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", physical_offset, devid, map->start, physical_len, stripe_len); ret = -EUCLEAN; goto out; } /* * Very old mkfs.btrfs (before v4.1) will not respect the reserved * space. Although kernel can handle it without problem, better to warn * the users. */ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) btrfs_warn(fs_info, "devid %llu physical %llu len %llu inside the reserved space", devid, physical_offset, physical_len); for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { found = true; if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, "too many dev extents for chunk %llu found", map->start); ret = -EUCLEAN; goto out; } map->verified_stripes++; break; } } if (!found) { btrfs_err(fs_info, "dev extent physical offset %llu devid %llu has no corresponding chunk", physical_offset, devid); ret = -EUCLEAN; } /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", devid, physical_offset, physical_len, dev->disk_total_bytes); ret = -EUCLEAN; goto out; } if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size; if (!IS_ALIGNED(physical_offset, zone_size) || !IS_ALIGNED(physical_len, zone_size)) { btrfs_err(fs_info, "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", devid, physical_offset, physical_len); ret = -EUCLEAN; goto out; } } out: btrfs_free_chunk_map(map); return ret; } static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { struct rb_node *node; int ret = 0; read_lock(&fs_info->mapping_tree_lock); for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { struct btrfs_chunk_map *map; map = rb_entry(node, struct btrfs_chunk_map, rb_node); if (map->num_stripes != map->verified_stripes) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", map->start, map->verified_stripes, map->num_stripes); ret = -EUCLEAN; goto out; } } out: 
read_unlock(&fs_info->mapping_tree_lock); return ret; } /* * Ensure that all dev extents are mapped to correct chunk, otherwise * later chunk allocation/free would cause unexpected behavior. * * NOTE: This will iterate through the whole device tree, which should be of * the same size level as the chunk tree. This slightly increases mount time. */ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; u64 prev_devid = 0; u64 prev_dev_ext_end = 0; int ret = 0; /* * We don't have a dev_root because we mounted with ignorebadroots and * failed to load the root, so we want to skip the verification in this * case for sure. * * However if the dev root is fine, but the tree itself is corrupted * we'd still fail to mount. This verification is only to make sure * writes can happen safely, so instead just bypass this check * completely in the case of IGNOREBADROOTS. */ if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) return 0; key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; /* No dev extents at all? Not good */ if (ret > 0) { ret = -EUCLEAN; goto out; } } while (1) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dev_extent *dext; int slot = path->slots[0]; u64 chunk_offset; u64 physical_offset; u64 physical_len; u64 devid; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type != BTRFS_DEV_EXTENT_KEY) break; devid = key.objectid; physical_offset = key.offset; dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); /* Check if this dev extent overlaps with the previous one */ if (devid == prev_devid && physical_offset < prev_dev_ext_end) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); ret = -EUCLEAN; goto out; } ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; prev_devid = devid; prev_dev_ext_end = physical_offset + physical_len; ret = btrfs_next_item(root, path); if (ret < 0) goto out; if (ret > 0) { ret = 0; break; } } /* Ensure all chunks have corresponding dev extents */ ret = verify_chunk_dev_extent_mapping(fs_info); out: btrfs_free_path(path); return ret; } /* * Check whether the given block group or device is pinned by any inode being * used as a swapfile. 
*/ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) { struct btrfs_swapfile_pin *sp; struct rb_node *node; spin_lock(&fs_info->swapfile_pins_lock); node = fs_info->swapfile_pins.rb_node; while (node) { sp = rb_entry(node, struct btrfs_swapfile_pin, node); if (ptr < sp->ptr) node = node->rb_left; else if (ptr > sp->ptr) node = node->rb_right; else break; } spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } static int relocating_repair_kthread(void *data) { struct btrfs_block_group *cache = data; struct btrfs_fs_info *fs_info = cache->fs_info; u64 target; int ret = 0; target = cache->start; btrfs_put_block_group(cache); sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); sb_end_write(fs_info->sb); return -EBUSY; } mutex_lock(&fs_info->reclaim_bgs_lock); /* Ensure block group still exists */ cache = btrfs_lookup_block_group(fs_info, target); if (!cache) goto out; if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) goto out; ret = btrfs_may_alloc_data_chunk(fs_info, target); if (ret < 0) goto out; btrfs_info(fs_info, "zoned: relocating block group %llu to repair IO failure", target); ret = btrfs_relocate_chunk(fs_info, target); out: if (cache) btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); return ret; } bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; if (!btrfs_is_zoned(fs_info)) return false; /* Do not attempt to repair in degraded state */ if (btrfs_test_opt(fs_info, DEGRADED)) return true; cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return true; if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { btrfs_put_block_group(cache); return true; } kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); return true; } static void map_raid56_repair_block(struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, u64 logical) { int data_stripes = nr_bioc_data_stripes(bioc); int i; for (i = 0; i < data_stripes; i++) { u64 stripe_start = bioc->full_stripe_logical + btrfs_stripe_nr_to_offset(i); if (logical >= stripe_start && logical < stripe_start + BTRFS_STRIPE_LEN) break; } ASSERT(i < data_stripes); smap->dev = bioc->stripes[i].dev; smap->physical = bioc->stripes[i].physical + ((logical - bioc->full_stripe_logical) & BTRFS_STRIPE_LEN_MASK); } /* * Map a repair write into a single device. * * A repair write is triggered by read time repair or scrub, which would only * update the contents of a single device. * Not update any other mirrors nor go through RMW path. * * Callers should ensure: * * - Call btrfs_bio_counter_inc_blocked() first * - The range does not cross stripe boundary * - Has a valid @mirror_num passed in. */ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num) { struct btrfs_io_context *bioc = NULL; u64 map_length = length; int mirror_ret = mirror_num; int ret; ASSERT(mirror_num > 0); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, smap, &mirror_ret); if (ret < 0) return ret; /* The map range should not cross stripe boundary. */ ASSERT(map_length >= length); /* Already mapped to single stripe. */ if (!bioc) goto out; /* Map the RAID56 multi-stripe writes to a single one. 
*/ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { map_raid56_repair_block(bioc, smap, logical); goto out; } ASSERT(mirror_num <= bioc->num_stripes); smap->dev = bioc->stripes[mirror_num - 1].dev; smap->physical = bioc->stripes[mirror_num - 1].physical; out: btrfs_put_bioc(bioc); ASSERT(smap->dev); return 0; }
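/*
 * Hedged userspace sketch (not part of the kernel sources above): it shows how
 * the btrfs_get_dev_stats() ioctl handler shown earlier is typically reached
 * from user space, assuming the BTRFS_IOC_GET_DEV_STATS ioctl and
 * struct btrfs_ioctl_get_dev_stats exported by <linux/btrfs.h>. The mount
 * path and devid 1 are illustrative assumptions; error handling is minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_get_dev_stats stats;
	int fd = open("/mnt/btrfs", O_RDONLY);	/* any path on the filesystem */

	if (fd < 0)
		return 1;

	memset(&stats, 0, sizeof(stats));
	stats.devid = 1;				/* first device, assumed */
	stats.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	/* stats.flags = BTRFS_DEV_STATS_RESET;	   would zero the counters */

	if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &stats) == 0)
		printf("wr %llu rd %llu flush %llu corrupt %llu gen %llu\n",
		       (unsigned long long)stats.values[BTRFS_DEV_STAT_WRITE_ERRS],
		       (unsigned long long)stats.values[BTRFS_DEV_STAT_READ_ERRS],
		       (unsigned long long)stats.values[BTRFS_DEV_STAT_FLUSH_ERRS],
		       (unsigned long long)stats.values[BTRFS_DEV_STAT_CORRUPTION_ERRS],
		       (unsigned long long)stats.values[BTRFS_DEV_STAT_GENERATION_ERRS]);

	close(fd);
	return 0;
}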
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_ftp.c: IPVS ftp application module
 *
 * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:
 *
 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
 * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
* * IP_MASQ_FTP ftp masquerading module * * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 * * Author: Wouter Gadeyne */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> #include <linux/unaligned.h> #include <net/ip_vs.h> #define SERVER_STRING_PASV "227 " #define CLIENT_STRING_PORT "PORT" #define SERVER_STRING_EPSV "229 " #define CLIENT_STRING_EPRT "EPRT" enum { IP_VS_FTP_ACTIVE = 0, IP_VS_FTP_PORT = 0, IP_VS_FTP_PASV, IP_VS_FTP_EPRT, IP_VS_FTP_EPSV, }; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. */ static unsigned int ports_count = 1; static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) { struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); if ((th->doff << 2) < sizeof(struct tcphdr)) return NULL; return (char *)th + (th->doff << 2); } static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { /* We use connection tracking for the command connection */ cp->flags |= IP_VS_CONN_F_NFCT; return 0; } static int ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { return 0; } /* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started * with the "pattern". <addr,port> is in network order. * Parse extended format depending on ext. In this case addr can be pre-set. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, const char *pattern, size_t plen, char skip, bool ext, int mode, union nf_inet_addr *addr, __be16 *port, __u16 af, char **start, char **end) { char *s, c; unsigned char p[6]; char edelim; __u16 hport; int i = 0; if (data_limit - data < plen) { /* check if there is partial match */ if (strncasecmp(data, pattern, data_limit - data) == 0) return -1; else return 0; } if (strncasecmp(data, pattern, plen) != 0) { return 0; } s = data + plen; if (skip) { bool found = false; for (;; s++) { if (s == data_limit) return -1; if (!found) { /* "(" is optional for non-extended format, * so catch the start of IPv4 address */ if (!ext && isdigit(*s)) break; if (*s == skip) found = true; } else if (*s != skip) { break; } } } /* Old IPv4-only format? 
*/ if (!ext) { p[0] = 0; for (data = s; ; data++) { if (data == data_limit) return -1; c = *data; if (isdigit(c)) { p[i] = p[i]*10 + c - '0'; } else if (c == ',' && i < 5) { i++; p[i] = 0; } else { /* unexpected character or terminator */ break; } } if (i != 5) return -1; *start = s; *end = data; addr->ip = get_unaligned((__be32 *) p); *port = get_unaligned((__be16 *) (p + 4)); return 1; } if (s == data_limit) return -1; *start = s; edelim = *s++; if (edelim < 33 || edelim > 126) return -1; if (s == data_limit) return -1; if (*s == edelim) { /* Address family is usually missing for EPSV response */ if (mode != IP_VS_FTP_EPSV) return -1; s++; if (s == data_limit) return -1; /* Then address should be missing too */ if (*s != edelim) return -1; /* Caller can pre-set addr, if needed */ s++; } else { const char *ep; /* We allow address only from same family */ if (af == AF_INET6 && *s != '2') return -1; if (af == AF_INET && *s != '1') return -1; s++; if (s == data_limit) return -1; if (*s != edelim) return -1; s++; if (s == data_limit) return -1; if (af == AF_INET6) { if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, &ep) <= 0) return -1; } else { if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, &ep) <= 0) return -1; } s = (char *) ep; if (s == data_limit) return -1; if (*s != edelim) return -1; s++; } for (hport = 0; ; s++) { if (s == data_limit) return -1; if (!isdigit(*s)) break; hport = hport * 10 + *s - '0'; } if (s == data_limit || !hport || *s != edelim) return -1; s++; *end = s; *port = htons(hport); return 1; } /* Look at outgoing ftp packets to catch the response to a PASV/EPSV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, * client port 0 (unknown at the moment), the server address and the * server port. Mark the current connection entry as a control channel * of the new entry. All this work is just to make the data connection * can be scheduled to the right server later. * * The outgoing packet should be something like * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. * The extended format for EPSV response provides usually only port: * "229 Entering Extended Passive Mode (|||ppp|)" */ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct sk_buff *skb, int *diff, struct ip_vs_iphdr *ipvsh) { char *data, *data_limit; char *start, *end; union nf_inet_addr from; __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ unsigned int buf_len; int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; *diff = 0; /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; /* Linear packets are much easier to deal with. 
*/ if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING_PASV, sizeof(SERVER_STRING_PASV)-1, '(', false, IP_VS_FTP_PASV, &from, &port, cp->af, &start, &end) != 1) return 1; IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", &from.ip, ntohs(port), &cp->caddr.ip, 0); } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; /* Usually, data address is not specified but * we support different address, so pre-set it. */ from = cp->daddr; if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING_EPSV, sizeof(SERVER_STRING_EPSV)-1, '(', true, IP_VS_FTP_EPSV, &from, &port, cp->af, &start, &end) != 1) return 1; IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); } else { return 1; } /* Now update or create a connection entry for it */ { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &from, port, &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &cp->caddr, 0, &cp->vaddr, port, &p); n_cp = ip_vs_conn_new(&p, cp->af, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); if (!n_cp) return 0; /* add its controller */ ip_vs_control_add(n_cp, cp); } /* Replace the old passive address with the new one */ if (cp->app_data == (void *) IP_VS_FTP_PASV) { from.ip = n_cp->vaddr.ip; port = n_cp->vport; snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", ((unsigned char *)&from.ip)[0], ((unsigned char *)&from.ip)[1], ((unsigned char *)&from.ip)[2], ((unsigned char *)&from.ip)[3], ntohs(port) >> 8, ntohs(port) & 0xFF); } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { from = n_cp->vaddr; port = n_cp->vport; /* Only port, client will use VIP for the data connection */ snprintf(buf, sizeof(buf), "|||%u|", ntohs(port)); } else { *buf = 0; } buf_len = strlen(buf); ct = nf_ct_get(skb, &ctinfo); if (ct) { bool mangled; /* If mangling fails this function will return 0 * which will cause the packet to be dropped. * Mangling can only fail under memory pressure, * hopefully it will succeed on the retransmitted * packet. */ mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, ipvsh->len, start - data, end - start, buf, buf_len); if (mangled) { ip_vs_nfct_expect_related(skb, ct, n_cp, ipvsh->protocol, 0, 0); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_UNNECESSARY; /* csum is updated */ ret = 1; } } /* Not setting 'diff' is intentional, otherwise the sequence * would be adjusted twice. */ cp->app_data = (void *) IP_VS_FTP_ACTIVE; ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return ret; } /* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command * (outside-to-inside). * * The incoming packet having the PORT command should be something like * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. * In this case, we create a connection entry using the client address and * port, so that the active ftp data connection from the server can reach * the client. 
* Extended format: * "EPSV\r\n" when client requests server address from same family * "EPSV 1\r\n" when client requests IPv4 server address * "EPSV 2\r\n" when client requests IPv6 server address * "EPSV ALL\r\n" - not supported * EPRT with specified delimiter (ASCII 33..126), "|" by default: * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport */ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, struct sk_buff *skb, int *diff, struct ip_vs_iphdr *ipvsh) { char *data, *data_start, *data_limit; char *start, *end; union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; /* no diff required for incoming packets */ *diff = 0; /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; /* Linear packets are much easier to deal with. */ if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; while (data <= data_limit - 6) { if (cp->af == AF_INET && strncasecmp(data, "PASV\r\n", 6) == 0) { /* Passive mode on */ IP_VS_DBG(7, "got PASV at %td of %td\n", data - data_start, data_limit - data_start); cp->app_data = (void *) IP_VS_FTP_PASV; return 1; } /* EPSV or EPSV<space><net-prt> */ if (strncasecmp(data, "EPSV", 4) == 0 && (data[4] == ' ' || data[4] == '\r')) { if (data[4] == ' ') { char proto = data[5]; if (data > data_limit - 7 || data[6] != '\r') return 1; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && proto == '2') { } else #endif if (cp->af == AF_INET && proto == '1') { } else { return 1; } } /* Extended Passive mode on */ IP_VS_DBG(7, "got EPSV at %td of %td\n", data - data_start, data_limit - data_start); cp->app_data = (void *) IP_VS_FTP_EPSV; return 1; } data++; } /* * To support virtual FTP server, the scenerio is as follows: * FTP client ----> Load Balancer ----> FTP server * First detect the port number in the application data, * then create a new connection entry for the coming data * connection. 
*/ if (cp->af == AF_INET && ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING_PORT, sizeof(CLIENT_STRING_PORT)-1, ' ', false, IP_VS_FTP_PORT, &to, &port, cp->af, &start, &end) == 1) { IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); /* Now update or create a connection entry for it */ IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", ip_vs_proto_name(ipvsh->protocol), &to.ip, ntohs(port), &cp->vaddr.ip, ntohs(cp->vport)-1); } else if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING_EPRT, sizeof(CLIENT_STRING_EPRT)-1, ' ', true, IP_VS_FTP_EPRT, &to, &port, cp->af, &start, &end) == 1) { IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); /* Now update or create a connection entry for it */ IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", ip_vs_proto_name(ipvsh->protocol), IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)-1); } else { return 1; } /* Passive mode off */ cp->app_data = (void *) IP_VS_FTP_ACTIVE; { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); if (!n_cp) return 0; /* add its controller */ ip_vs_control_add(n_cp, cp); } } /* * Move tunnel to listen state */ ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return 1; } static struct ip_vs_app ip_vs_ftp = { .name = "ftp", .type = IP_VS_APP_TYPE_FTP, .protocol = IPPROTO_TCP, .module = THIS_MODULE, .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), .init_conn = ip_vs_ftp_init_conn, .done_conn = ip_vs_ftp_done_conn, .bind_conn = NULL, .unbind_conn = NULL, .pkt_out = ip_vs_ftp_out, .pkt_in = ip_vs_ftp_in, }; /* * per netns ip_vs_ftp initialization */ static int __net_init __ip_vs_ftp_init(struct net *net) { int i, ret; struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return -ENOENT; app = register_ip_vs_app(ipvs, &ip_vs_ftp); if (IS_ERR(app)) return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; } return 0; err_unreg: unregister_ip_vs_app(ipvs, &ip_vs_ftp); return ret; } /* * netns exit */ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { .init = __ip_vs_ftp_init, .exit = __ip_vs_ftp_exit, }; static int __init ip_vs_ftp_init(void) { /* rcu_barrier() is called by netns on error */ return register_pernet_subsys(&ip_vs_ftp_ops); } /* * ip_vs_ftp finish. */ static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs ftp helper");
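/*
 * Hedged standalone sketch (not kernel code): it illustrates the
 * "227 Entering Passive Mode (h1,h2,h3,h4,p1,p2)" reply format that
 * ip_vs_ftp_out() rewrites above, where the port is p1 * 256 + p2.
 * The helper name parse_pasv_reply() and the use of sscanf() are
 * illustrative assumptions; the kernel itself parses byte-by-byte via
 * ip_vs_ftp_get_addrport().
 */
#include <stdio.h>

/* Returns 0 and fills ip/port on success, -1 on parse failure. */
static int parse_pasv_reply(const char *reply, unsigned char ip[4],
			    unsigned short *port)
{
	unsigned int a, b, c, d, p1, p2;

	if (sscanf(reply, "227 Entering Passive Mode (%u,%u,%u,%u,%u,%u)",
		   &a, &b, &c, &d, &p1, &p2) != 6)
		return -1;
	if (a > 255 || b > 255 || c > 255 || d > 255 || p1 > 255 || p2 > 255)
		return -1;
	ip[0] = a; ip[1] = b; ip[2] = c; ip[3] = d;
	*port = (unsigned short)((p1 << 8) | p2);	/* high byte, low byte */
	return 0;
}

int main(void)
{
	unsigned char ip[4];
	unsigned short port;

	if (!parse_pasv_reply("227 Entering Passive Mode (192,168,0,10,195,80)",
			      ip, &port))
		printf("%u.%u.%u.%u:%u\n", (unsigned int)ip[0],
		       (unsigned int)ip[1], (unsigned int)ip[2],
		       (unsigned int)ip[3], (unsigned int)port);
	return 0;
}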
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CAIA Delay-Gradient (CDG) congestion control
 *
 * This implementation is based on the paper:
 *   D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
 *   delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
 *
 * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
 * heuristics using parameters use_shadow=0 and use_ineff=0.
 *
 * Parameters window, backoff_beta, and backoff_factor are crucial for
 * throughput and delay. Future work is needed to determine better defaults,
 * and to provide guidelines for use in different environments/contexts.
 *
 * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
 * Parameter window is only configurable when loading tcp_cdg as a module.
 *
 * Notable differences from paper/FreeBSD:
 *   o Using Hybrid Slow start and Proportional Rate Reduction.
 *   o Add toggle for shadow window mechanism. Suggested by David Hayes.
 *   o Add toggle for non-congestion loss tolerance.
 *   o Scaling parameter G is changed to a backoff factor;
 *     conversion is given by: backoff_factor = 1000/(G * window).
 *   o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
 *   o More accurate e^-x.
*/ #include <linux/kernel.h> #include <linux/random.h> #include <linux/module.h> #include <linux/sched/clock.h> #include <net/tcp.h> #define HYSTART_ACK_TRAIN 1 #define HYSTART_DELAY 2 static int window __read_mostly = 8; static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ static unsigned int backoff_factor __read_mostly = 42; static unsigned int hystart_detect __read_mostly = 3; static unsigned int use_ineff __read_mostly = 5; static bool use_shadow __read_mostly = true; static bool use_tolerance __read_mostly; module_param(window, int, 0444); MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); module_param(backoff_beta, uint, 0644); MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); module_param(backoff_factor, uint, 0644); MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); module_param(hystart_detect, uint, 0644); MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); module_param(use_ineff, uint, 0644); MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); module_param(use_shadow, bool, 0644); MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); module_param(use_tolerance, bool, 0644); MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); struct cdg_minmax { union { struct { s32 min; s32 max; }; u64 v64; }; }; enum cdg_state { CDG_UNKNOWN = 0, CDG_NONFULL = 1, CDG_FULL = 2, CDG_BACKOFF = 3, }; struct cdg { struct cdg_minmax rtt; struct cdg_minmax rtt_prev; struct cdg_minmax *gradients; struct cdg_minmax gsum; bool gfilled; u8 tail; u8 state; u8 delack; u32 rtt_seq; u32 shadow_wnd; u16 backoff_cnt; u16 sample_cnt; s32 delay_min; u32 last_ack; u32 round_start; }; /** * nexp_u32 - negative base-e exponential * @ux: x in units of micro * * Returns exp(ux * -1e-6) * U32_MAX. */ static u32 __pure nexp_u32(u32 ux) { static const u16 v[] = { /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ 65535, 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, }; u32 msb = ux >> 8; u32 res; int i; /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ if (msb > U16_MAX) return 0; /* Scale first eight bits linearly: */ res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ for (i = 1; msb; i++, msb >>= 1) { u32 y = v[i & -(msb & 1)] + U32_C(1); res = ((u64)res * y) >> 16; } return res; } /* Based on the HyStart algorithm (by Ha et al.) that is implemented in * tcp_cubic. Differences/experimental changes: * o Using Hayes' delayed ACK filter. * o Using a usec clock for the ACK train. * o Reset ACK train when application limited. * o Invoked at any cwnd (i.e. also when cwnd < 16). * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). 
*/ static void tcp_cdg_hystart_update(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); if (ca->delay_min == 0) return; if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now_us = tp->tcp_mstamp; if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { ca->last_ack = now_us; ca->round_start = now_us; } else if (before(now_us, ca->last_ack + 3000)) { u32 base_owd = max(ca->delay_min / 2U, 125U); ca->last_ack = now_us; if (after(now_us, ca->round_start + base_owd)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); return; } } } if (hystart_detect & HYSTART_DELAY) { if (ca->sample_cnt < 8) { ca->sample_cnt++; } else { s32 thresh = max(ca->delay_min + ca->delay_min / 8U, 125U); if (ca->rtt.min > thresh) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } } static s32 tcp_cdg_grad(struct cdg *ca) { s32 gmin = ca->rtt.min - ca->rtt_prev.min; s32 gmax = ca->rtt.max - ca->rtt_prev.max; s32 grad; if (ca->gradients) { ca->gsum.min += gmin - ca->gradients[ca->tail].min; ca->gsum.max += gmax - ca->gradients[ca->tail].max; ca->gradients[ca->tail].min = gmin; ca->gradients[ca->tail].max = gmax; ca->tail = (ca->tail + 1) & (window - 1); gmin = ca->gsum.min; gmax = ca->gsum.max; } /* We keep sums to ignore gradients during cwnd reductions; * the paper's smoothed gradients otherwise simplify to: * (rtt_latest - rtt_oldest) / window. * * We also drop division by window here. */ grad = gmin > 0 ? gmin : gmax; /* Extrapolate missing values in gradient window: */ if (!ca->gfilled) { if (!ca->gradients && window > 1) grad *= window; /* Memory allocation failed. */ else if (ca->tail == 0) ca->gfilled = true; else grad = (grad * window) / (int)ca->tail; } /* Backoff was effectual: */ if (gmin <= -32 || gmax <= -32) ca->backoff_cnt = 0; if (use_tolerance) { /* Reduce small variations to zero: */ gmin = DIV_ROUND_CLOSEST(gmin, 64); gmax = DIV_ROUND_CLOSEST(gmax, 64); if (gmin > 0 && gmax <= 0) ca->state = CDG_FULL; else if ((gmin > 0 && gmax > 0) || gmax < 0) ca->state = CDG_NONFULL; } return grad; } static bool tcp_cdg_backoff(struct sock *sk, u32 grad) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); if (get_random_u32() <= nexp_u32(grad * backoff_factor)) return false; if (use_ineff) { ca->backoff_cnt++; if (ca->backoff_cnt > use_ineff) return false; } ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp)); ca->state = CDG_BACKOFF; tcp_enter_cwr(sk); return true; } /* Not called in CWR or Recovery state. 
*/ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); u32 prior_snd_cwnd; u32 incr; if (tcp_in_slow_start(tp) && hystart_detect) tcp_cdg_hystart_update(sk); if (after(ack, ca->rtt_seq) && ca->rtt.v64) { s32 grad = 0; if (ca->rtt_prev.v64) grad = tcp_cdg_grad(ca); ca->rtt_seq = tp->snd_nxt; ca->rtt_prev = ca->rtt; ca->rtt.v64 = 0; ca->last_ack = 0; ca->sample_cnt = 0; if (grad > 0 && tcp_cdg_backoff(sk, grad)) return; } if (!tcp_is_cwnd_limited(sk)) { ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp)); return; } prior_snd_cwnd = tcp_snd_cwnd(tp); tcp_reno_cong_avoid(sk, ack, acked); incr = tcp_snd_cwnd(tp) - prior_snd_cwnd; ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); } static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); if (sample->rtt_us <= 0) return; /* A heuristic for filtering delayed ACKs, adapted from: * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. */ if (tp->sacked_out == 0) { if (sample->pkts_acked == 1 && ca->delack) { /* A delayed ACK is only used for the minimum if it is * provenly lower than an existing non-zero minimum. */ ca->rtt.min = min(ca->rtt.min, sample->rtt_us); ca->delack--; return; } else if (sample->pkts_acked > 1 && ca->delack < 5) { ca->delack++; } } ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us); ca->rtt.max = max(ca->rtt.max, sample->rtt_us); } static u32 tcp_cdg_ssthresh(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); if (ca->state == CDG_BACKOFF) return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10); if (ca->state == CDG_NONFULL && use_tolerance) return tcp_snd_cwnd(tp); ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp)); if (use_shadow) return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1); return max(2U, tcp_snd_cwnd(tp) >> 1); } static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); struct cdg_minmax *gradients; switch (ev) { case CA_EVENT_CWND_RESTART: gradients = ca->gradients; if (gradients) memset(gradients, 0, window * sizeof(gradients[0])); memset(ca, 0, sizeof(*ca)); ca->gradients = gradients; ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); break; case CA_EVENT_COMPLETE_CWR: ca->state = CDG_UNKNOWN; ca->rtt_seq = tp->snd_nxt; ca->rtt_prev = ca->rtt; ca->rtt.v64 = 0; break; default: break; } } static void tcp_cdg_init(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. 
*/ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), GFP_NOWAIT | __GFP_NOWARN); ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); } static void tcp_cdg_release(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); kfree(ca->gradients); ca->gradients = NULL; } static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, .undo_cwnd = tcp_reno_undo_cwnd, .ssthresh = tcp_cdg_ssthresh, .release = tcp_cdg_release, .init = tcp_cdg_init, .owner = THIS_MODULE, .name = "cdg", }; static int __init tcp_cdg_register(void) { if (backoff_beta > 1024 || window < 1 || window > 256) return -ERANGE; if (!is_power_of_2(window)) return -EINVAL; BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_cdg); return 0; } static void __exit tcp_cdg_unregister(void) { tcp_unregister_congestion_control(&tcp_cdg); } module_init(tcp_cdg_register); module_exit(tcp_cdg_unregister); MODULE_AUTHOR("Kenneth Klette Jonassen"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP CDG");
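/*
 * Hedged userspace sketch (not kernel code): the floating-point equivalent of
 * the probabilistic backoff decision in tcp_cdg_backoff() above. The kernel
 * uses the fixed-point nexp_u32() and get_random_u32(); backing off when the
 * random draw exceeds exp(-grad * backoff_factor * 1e-6) is the same as
 * backing off with probability 1 - exp(-grad * backoff_factor * 1e-6).
 * grad_us and the default backoff_factor of 42 follow the module parameters;
 * rand()/RAND_MAX is an illustrative stand-in for the kernel RNG.
 */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

static int cdg_should_backoff(long grad_us, unsigned int backoff_factor)
{
	/* P(backoff) = 1 - exp(-grad * backoff_factor * 1e-6) */
	double p = 1.0 - exp(-(double)grad_us * backoff_factor * 1e-6);
	double u = (double)rand() / RAND_MAX;

	return u < p;
}

int main(void)
{
	/* e.g. a 500 us RTT gradient with the default backoff_factor of 42 */
	printf("backoff this round: %s\n",
	       cdg_should_backoff(500, 42) ? "yes" : "no");
	return 0;
}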
// SPDX-License-Identifier: GPL-2.0-only
/*
 * TCP Low Priority (TCP-LP)
 *
 * TCP Low Priority is a distributed algorithm whose goal is to utilize only
 * the excess network bandwidth as compared to the ``fair share`` of
 * bandwidth as targeted by TCP.
 *
 * As of 2.6.13, Linux supports pluggable congestion control algorithms.
 * Due to the limitations of that API, we make the following changes to the
 * original TCP-LP implementation:
 *   o We use newReno for most of the core CA handling and only add some
 *     checks within cong_avoid.
 *   o Error correction for the remote HZ estimate: the estimate keeps being
 *     checked and updated instead of being computed only once.
 *   o One-Way Delay (OWD) is calculated within rtt_sample, since OWD has a
 *     similar meaning to RTT. The buggy formula is also corrected.
 *   o The reaction to an Early Congestion Indication (ECI) is handled within
 *     pkts_acked, as mentioned in the pseudo code.
 *   o OWD is handled in relative format, where the local time stamp is in
 *     tcp_time_stamp format.
 *
 * Original Author:
 *   Aleksandar Kuzmanovic <akuzma@northwestern.edu>
 * Available from:
 *   http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
 * Original implementation for 2.4.19:
 *   http://www-ece.rice.edu/networks/TCP-LP/
 *
 * 2.6.x module Authors:
 *   Wong Hoi Sing, Edison <hswong3i@gmail.com>
 *   Hung Hing Lun, Mike <hlhung3i@gmail.com>
 * SourceForge project page:
 *   http://tcp-lp-mod.sourceforge.net/
 */

#include <linux/module.h>
#include <net/tcp.h>

/* resolution of owd */
#define LP_RESOL       TCP_TS_HZ

/**
 * enum tcp_lp_state
 * @LP_VALID_RHZ: is the remote HZ estimate valid?
 * @LP_VALID_OWD: is the OWD estimate valid?
 * @LP_WITHIN_THR: are we within the threshold?
 * @LP_WITHIN_INF: are we within the inference window?
 *
 * TCP-LP's state flags.
 * This set of state flags exists mainly for debugging.
*/ enum tcp_lp_state { LP_VALID_RHZ = (1 << 0), LP_VALID_OWD = (1 << 1), LP_WITHIN_THR = (1 << 3), LP_WITHIN_INF = (1 << 4), }; /** * struct lp * @flag: TCP-LP state flag * @sowd: smoothed OWD << 3 * @owd_min: min OWD * @owd_max: max OWD * @owd_max_rsv: reserved max owd * @remote_hz: estimated remote HZ * @remote_ref_time: remote reference time * @local_ref_time: local reference time * @last_drop: time for last active drop * @inference: current inference * * TCP-LP's private struct. * We get the idea from original TCP-LP implementation where only left those we * found are really useful. */ struct lp { u32 flag; u32 sowd; u32 owd_min; u32 owd_max; u32 owd_max_rsv; u32 remote_hz; u32 remote_ref_time; u32 local_ref_time; u32 last_drop; u32 inference; }; /** * tcp_lp_init * @sk: socket to initialize congestion control algorithm for * * Init all required variables. * Clone the handling from Vegas module implementation. */ static void tcp_lp_init(struct sock *sk) { struct lp *lp = inet_csk_ca(sk); lp->flag = 0; lp->sowd = 0; lp->owd_min = 0xffffffff; lp->owd_max = 0; lp->owd_max_rsv = 0; lp->remote_hz = 0; lp->remote_ref_time = 0; lp->local_ref_time = 0; lp->last_drop = 0; lp->inference = 0; } /** * tcp_lp_cong_avoid * @sk: socket to avoid congesting * * Implementation of cong_avoid. * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) tcp_reno_cong_avoid(sk, ack, acked); } /** * tcp_lp_remote_hz_estimator * @sk: socket which needs an estimate for the remote HZs * * Estimate remote HZ. * We keep on updating the estimated value, where original TCP-LP * implementation only guest it for once and use forever. */ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ s64 m = 0; /* not yet record reference time * go away!! record it before come back!! */ if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) goto out; /* we can't calc remote HZ with no different!! */ if (tp->rx_opt.rcv_tsval == lp->remote_ref_time || tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; m = TCP_TS_HZ * (tp->rx_opt.rcv_tsval - lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - lp->local_ref_time); if (m < 0) m = -m; if (rhz > 0) { m -= rhz >> 6; /* m is now error in remote HZ est */ rhz += m; /* 63/64 old + 1/64 new */ } else rhz = m << 6; out: /* record time for successful remote HZ calc */ if ((rhz >> 6) > 0) lp->flag |= LP_VALID_RHZ; else lp->flag &= ~LP_VALID_RHZ; /* record reference time stamp */ lp->remote_ref_time = tp->rx_opt.rcv_tsval; lp->local_ref_time = tp->rx_opt.rcv_tsecr; return rhz >> 6; } /** * tcp_lp_owd_calculator * @sk: socket to calculate one way delay for * * Calculate one way delay (in relative format). * Original implement OWD as minus of remote time difference to local time * difference directly. As this time difference just simply equal to RTT, when * the network status is stable, remote RTT will equal to local RTT, and result * OWD into zero. * It seems to be a bug and so we fixed it. 
*/ static u32 tcp_lp_owd_calculator(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); s64 owd = 0; lp->remote_hz = tcp_lp_remote_hz_estimator(sk); if (lp->flag & LP_VALID_RHZ) { owd = tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ); if (owd < 0) owd = -owd; } if (owd > 0) lp->flag |= LP_VALID_OWD; else lp->flag &= ~LP_VALID_OWD; return owd; } /** * tcp_lp_rtt_sample * @sk: socket to add a rtt sample to * @rtt: round trip time, which is ignored! * * Implementation or rtt_sample. * Will take the following action, * 1. calc OWD, * 2. record the min/max OWD, * 3. calc smoothed OWD (SOWD). * Most ideas come from the original TCP-LP implementation. */ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) { struct lp *lp = inet_csk_ca(sk); s64 mowd = tcp_lp_owd_calculator(sk); /* sorry that we don't have valid data */ if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) return; /* record the next min owd */ if (mowd < lp->owd_min) lp->owd_min = mowd; /* always forget the max of the max * we just set owd_max as one below it */ if (mowd > lp->owd_max) { if (mowd > lp->owd_max_rsv) { if (lp->owd_max_rsv == 0) lp->owd_max = mowd; else lp->owd_max = lp->owd_max_rsv; lp->owd_max_rsv = mowd; } else lp->owd_max = mowd; } /* calc for smoothed owd */ if (lp->sowd != 0) { mowd -= lp->sowd >> 3; /* m is now error in owd est */ lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ } else lp->sowd = mowd << 3; /* take the measured time be owd */ } /** * tcp_lp_pkts_acked * @sk: socket requiring congestion avoidance calculations * * Implementation of pkts_acked. * Deal with active drop under Early Congestion Indication. * Only drop to half and 1 will be handle, because we hope to use back * newReno in increase case. 
* We work it out by following the idea from TCP-LP's paper directly */ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); u32 now = tcp_time_stamp_ts(tp); u32 delta; if (sample->rtt_us > 0) tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ delta = now - tp->rx_opt.rcv_tsecr; if ((s32)delta > 0) lp->inference = 3 * delta; /* test if within inference */ if (lp->last_drop && (now - lp->last_drop < lp->inference)) lp->flag |= LP_WITHIN_INF; else lp->flag &= ~LP_WITHIN_INF; /* test if within threshold */ if (lp->sowd >> 3 < lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) lp->flag |= LP_WITHIN_THR; else lp->flag &= ~LP_WITHIN_THR; pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, tcp_snd_cwnd(tp), lp->remote_hz, lp->owd_min, lp->owd_max, lp->sowd >> 3); if (lp->flag & LP_WITHIN_THR) return; /* FIXME: try to reset owd_min and owd_max here * so decrease the chance the min/max is no longer suitable * and will usually within threshold when within inference */ lp->owd_min = lp->sowd >> 3; lp->owd_max = lp->sowd >> 2; lp->owd_max_rsv = lp->sowd >> 2; /* happened within inference * drop snd_cwnd into 1 */ if (lp->flag & LP_WITHIN_INF) tcp_snd_cwnd_set(tp, 1U); /* happened after inference * cut snd_cwnd into half */ else tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp) >> 1U, 1U)); /* record this drop time */ lp->last_drop = now; } static struct tcp_congestion_ops tcp_lp __read_mostly = { .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_lp_cong_avoid, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, .name = "lp" }; static int __init tcp_lp_register(void) { BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_lp); } static void __exit tcp_lp_unregister(void) { tcp_unregister_congestion_control(&tcp_lp); } module_init(tcp_lp_register); module_exit(tcp_lp_unregister); MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Low Priority");
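/*
 * Hedged standalone sketch (not kernel code): the fixed-point smoothing and
 * threshold test used by tcp_lp_rtt_sample()/tcp_lp_pkts_acked() above.
 * sowd keeps the smoothed OWD left-shifted by 3, so the update below is
 * sowd = 7/8 * sowd + 1/8 * new_owd without any division. Struct and helper
 * names are illustrative.
 */
#include <stdio.h>

struct lp_owd {
	unsigned int sowd;	/* smoothed OWD << 3 */
	unsigned int owd_min;
	unsigned int owd_max;
};

static void lp_owd_sample(struct lp_owd *lp, unsigned int mowd)
{
	if (lp->sowd != 0)
		lp->sowd += (int)mowd - (int)(lp->sowd >> 3);
	else
		lp->sowd = mowd << 3;

	if (mowd < lp->owd_min)
		lp->owd_min = mowd;
	if (mowd > lp->owd_max)
		lp->owd_max = mowd;
}

/* "Within threshold": smoothed OWD below owd_min + 15% of the min-max range. */
static int lp_within_threshold(const struct lp_owd *lp)
{
	return (lp->sowd >> 3) <
	       lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100;
}

int main(void)
{
	struct lp_owd lp = { .sowd = 0, .owd_min = ~0U, .owd_max = 0 };
	unsigned int samples[] = { 100, 110, 105, 400, 120 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		lp_owd_sample(&lp, samples[i]);
		printf("owd %u sowd %u within_thr %d\n",
		       samples[i], lp.sowd >> 3, lp_within_threshold(&lp));
	}
	return 0;
}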
// SPDX-License-Identifier: GPL-2.0
/*
 * The base64 encode/decode code was copied from fscrypt:
 *  Copyright (C) 2015, Google, Inc.
 *  Copyright (C) 2015, Motorola Mobility
 *  Written by Uday Savagaonkar, 2014.
 *  Modified by Jaegeuk Kim, 2015.
 */
#include <linux/ceph/ceph_debug.h>
#include <linux/xattr.h>
#include <linux/fscrypt.h>
#include <linux/ceph/striper.h>

#include "super.h"
#include "mds_client.h"
#include "crypto.h"

/*
 * The base64url encoding used by fscrypt includes the '_' character, which may
 * cause problems in snapshot names (which cannot start with '_'). Thus, we use
 * the base64 encoding defined for IMAP mailbox names (RFC 3501) instead,
 * which replaces '-' and '_' by '+' and ','.
*/ static const char base64_table[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; int ceph_base64_encode(const u8 *src, int srclen, char *dst) { u32 ac = 0; int bits = 0; int i; char *cp = dst; for (i = 0; i < srclen; i++) { ac = (ac << 8) | src[i]; bits += 8; do { bits -= 6; *cp++ = base64_table[(ac >> bits) & 0x3f]; } while (bits >= 6); } if (bits) *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; return cp - dst; } int ceph_base64_decode(const char *src, int srclen, u8 *dst) { u32 ac = 0; int bits = 0; int i; u8 *bp = dst; for (i = 0; i < srclen; i++) { const char *p = strchr(base64_table, src[i]); if (p == NULL || src[i] == 0) return -1; ac = (ac << 6) | (p - base64_table); bits += 6; if (bits >= 8) { bits -= 8; *bp++ = (u8)(ac >> bits); } } if (ac & ((1 << bits) - 1)) return -1; return bp - dst; } static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth; u32 ctxlen; /* Non existent or too short? */ if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1))) return -ENOBUFS; /* Some format we don't recognize? */ if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION) return -ENOBUFS; ctxlen = le32_to_cpu(cfa->cfa_blob_len); if (len < ctxlen) return -ERANGE; memcpy(ctx, cfa->cfa_blob, ctxlen); return ctxlen; } static int ceph_crypt_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { int ret; struct iattr attr = { }; struct ceph_iattr cia = { }; struct ceph_fscrypt_auth *cfa; WARN_ON_ONCE(fs_data); if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE) return -EINVAL; cfa = kzalloc(sizeof(*cfa), GFP_KERNEL); if (!cfa) return -ENOMEM; cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); cfa->cfa_blob_len = cpu_to_le32(len); memcpy(cfa->cfa_blob, ctx, len); cia.fscrypt_auth = cfa; ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia); if (ret == 0) inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); kfree(cia.fscrypt_auth); return ret; } static bool ceph_crypt_empty_dir(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); return ci->i_rsubdirs + ci->i_rfiles == 1; } static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) { return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy; } static struct fscrypt_operations ceph_fscrypt_ops = { .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, .get_dummy_policy = ceph_get_dummy_policy, .empty_dir = ceph_crypt_empty_dir, }; void ceph_fscrypt_set_ops(struct super_block *sb) { fscrypt_set_ops(sb, &ceph_fscrypt_ops); } void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) { fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy); } int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, struct ceph_acl_sec_ctx *as) { int ret, ctxsize; bool encrypted = false; struct ceph_inode_info *ci = ceph_inode(inode); ret = fscrypt_prepare_new_inode(dir, inode, &encrypted); if (ret) return ret; if (!encrypted) return 0; as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL); if (!as->fscrypt_auth) return -ENOMEM; ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob, inode); if (ctxsize < 0) return ctxsize; as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize); WARN_ON_ONCE(ci->fscrypt_auth); kfree(ci->fscrypt_auth); 
ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth); ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len, GFP_KERNEL); if (!ci->fscrypt_auth) return -ENOMEM; inode->i_flags |= S_ENCRYPTED; return 0; } void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as) { swap(req->r_fscrypt_auth, as->fscrypt_auth); } /* * User-created snapshots can't start with '_'. Snapshots that start with this * character are special (hint: there aren't real snapshots) and use the * following format: * * _<SNAPSHOT-NAME>_<INODE-NUMBER> * * where: * - <SNAPSHOT-NAME> - the real snapshot name that may need to be decrypted, * - <INODE-NUMBER> - the inode number (in decimal) for the actual snapshot * * This function parses these snapshot names and returns the inode * <INODE-NUMBER>. 'name_len' will also bet set with the <SNAPSHOT-NAME> * length. */ static struct inode *parse_longname(const struct inode *parent, const char *name, int *name_len) { struct ceph_client *cl = ceph_inode_to_client(parent); struct inode *dir = NULL; struct ceph_vino vino = { .snap = CEPH_NOSNAP }; char *inode_number; char *name_end; int orig_len = *name_len; int ret = -EIO; /* Skip initial '_' */ name++; name_end = strrchr(name, '_'); if (!name_end) { doutc(cl, "failed to parse long snapshot name: %s\n", name); return ERR_PTR(-EIO); } *name_len = (name_end - name); if (*name_len <= 0) { pr_err_client(cl, "failed to parse long snapshot name\n"); return ERR_PTR(-EIO); } /* Get the inode number */ inode_number = kmemdup_nul(name_end + 1, orig_len - *name_len - 2, GFP_KERNEL); if (!inode_number) return ERR_PTR(-ENOMEM); ret = kstrtou64(inode_number, 10, &vino.ino); if (ret) { doutc(cl, "failed to parse inode number: %s\n", name); dir = ERR_PTR(ret); goto out; } /* And finally the inode */ dir = ceph_find_inode(parent->i_sb, vino); if (!dir) { /* This can happen if we're not mounting cephfs on the root */ dir = ceph_get_inode(parent->i_sb, vino, NULL); if (IS_ERR(dir)) doutc(cl, "can't find inode %s (%s)\n", inode_number, name); } out: kfree(inode_number); return dir; } int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, char *buf) { struct ceph_client *cl = ceph_inode_to_client(parent); struct inode *dir = parent; struct qstr iname; u32 len; int name_len; int elen; int ret; u8 *cryptbuf = NULL; iname.name = d_name->name; name_len = d_name->len; /* Handle the special case of snapshot names that start with '_' */ if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && (iname.name[0] == '_')) { dir = parse_longname(parent, iname.name, &name_len); if (IS_ERR(dir)) return PTR_ERR(dir); iname.name++; /* skip initial '_' */ } iname.len = name_len; if (!fscrypt_has_encryption_key(dir)) { memcpy(buf, d_name->name, d_name->len); elen = d_name->len; goto out; } /* * Convert cleartext d_name to ciphertext. If result is longer than * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes * * See: fscrypt_setup_filename */ if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) { elen = -ENAMETOOLONG; goto out; } /* Allocate a buffer appropriate to hold the result */ cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? 
NAME_MAX : len, GFP_KERNEL); if (!cryptbuf) { elen = -ENOMEM; goto out; } ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len); if (ret) { elen = ret; goto out; } /* hash the end if the name is long enough */ if (len > CEPH_NOHASH_NAME_MAX) { u8 hash[SHA256_DIGEST_SIZE]; u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX; /* * hash the extra bytes and overwrite crypttext beyond that * point with it */ sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash); memcpy(extra, hash, SHA256_DIGEST_SIZE); len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE; } /* base64 encode the encrypted name */ elen = ceph_base64_encode(cryptbuf, len, buf); doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf); /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ WARN_ON(elen > 240); if ((elen > 0) && (dir != parent)) { char tmp_buf[NAME_MAX]; elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", elen, buf, dir->i_ino); memcpy(buf, tmp_buf, elen); } out: kfree(cryptbuf); if (dir != parent) { if ((dir->i_state & I_NEW)) discard_new_inode(dir); else iput(dir); } return elen; } int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, char *buf) { WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)); return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf); } /** * ceph_fname_to_usr - convert a filename for userland presentation * @fname: ceph_fname to be converted * @tname: temporary name buffer to use for conversion (may be NULL) * @oname: where converted name should be placed * @is_nokey: set to true if key wasn't available during conversion (may be NULL) * * Given a filename (usually from the MDS), format it for presentation to * userland. If @parent is not encrypted, just pass it back as-is. * * Otherwise, base64 decode the string, and then ask fscrypt to format it * for userland presentation. * * Returns 0 on success or negative error code on error. */ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, struct fscrypt_str *oname, bool *is_nokey) { struct inode *dir = fname->dir; struct fscrypt_str _tname = FSTR_INIT(NULL, 0); struct fscrypt_str iname; char *name = fname->name; int name_len = fname->name_len; int ret; /* Sanity check that the resulting name will fit in the buffer */ if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX) return -EIO; /* Handle the special case of snapshot names that start with '_' */ if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && (name[0] == '_')) { dir = parse_longname(dir, name, &name_len); if (IS_ERR(dir)) return PTR_ERR(dir); name++; /* skip initial '_' */ } if (!IS_ENCRYPTED(dir)) { oname->name = fname->name; oname->len = fname->name_len; ret = 0; goto out_inode; } ret = ceph_fscrypt_prepare_readdir(dir); if (ret) goto out_inode; /* * Use the raw dentry name as sent by the MDS instead of * generating a nokey name via fscrypt. 
*/ if (!fscrypt_has_encryption_key(dir)) { if (fname->no_copy) oname->name = fname->name; else memcpy(oname->name, fname->name, fname->name_len); oname->len = fname->name_len; if (is_nokey) *is_nokey = true; ret = 0; goto out_inode; } if (fname->ctext_len == 0) { int declen; if (!tname) { ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname); if (ret) goto out_inode; tname = &_tname; } declen = ceph_base64_decode(name, name_len, tname->name); if (declen <= 0) { ret = -EIO; goto out; } iname.name = tname->name; iname.len = declen; } else { iname.name = fname->ctext; iname.len = fname->ctext_len; } ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname); if (!ret && (dir != fname->dir)) { char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)]; name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", oname->len, oname->name, dir->i_ino); memcpy(oname->name, tmp_buf, name_len); oname->len = name_len; } out: fscrypt_fname_free_buffer(&_tname); out_inode: if (dir != fname->dir) { if ((dir->i_state & I_NEW)) discard_new_inode(dir); else iput(dir); } return ret; } /** * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper * @dir: directory inode for readdir prep * * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as * non-complete if this call results in having the directory unlocked. * * Returns: * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked) * 0 - if directory is still locked * < 0 - if __fscrypt_prepare_readdir() fails */ int ceph_fscrypt_prepare_readdir(struct inode *dir) { bool had_key = fscrypt_has_encryption_key(dir); int err; if (!IS_ENCRYPTED(dir)) return 0; err = __fscrypt_prepare_readdir(dir); if (err) return err; if (!had_key && fscrypt_has_encryption_key(dir)) { /* directory just got unlocked, mark it as not complete */ ceph_dir_clear_complete(dir); return 1; } return 0; } int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { struct ceph_client *cl = ceph_inode_to_client(inode); doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, ceph_vinop(inode), len, offs, lblk_num); return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num); } int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { struct ceph_client *cl = ceph_inode_to_client(inode); doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, ceph_vinop(inode), len, offs, lblk_num); return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num, gfp_flags); } /** * ceph_fscrypt_decrypt_pages - decrypt an array of pages * @inode: pointer to inode associated with these pages * @page: pointer to page array * @off: offset into the file that the read data starts * @len: max length to decrypt * * Decrypt an array of fscrypt'ed pages and return the amount of * data decrypted. Any data in the page prior to the start of the * first complete block in the read is ignored. Any incomplete * crypto blocks at the end of the array are ignored (and should * probably be zeroed by the caller). * * Returns the length of the decrypted data or a negative errno. */ int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, u64 off, int len) { int i, num_blocks; u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; int ret = 0; /* * We can't deal with partial blocks on an encrypted file, so mask off * the last bit. 
*/ num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); /* Decrypt each block */ for (i = 0; i < num_blocks; ++i) { int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; int pgidx = blkoff >> PAGE_SHIFT; unsigned int pgoffs = offset_in_page(blkoff); int fret; fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx], CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, baseblk + i); if (fret < 0) { if (ret == 0) ret = fret; break; } ret += CEPH_FSCRYPT_BLOCK_SIZE; } return ret; } /** * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer * @inode: inode associated with pages being decrypted * @page: pointer to page array * @off: offset into the file that the data in page[0] starts * @map: pointer to extent array * @ext_cnt: length of extent array * * Given an extent map and a page array, decrypt the received data in-place, * skipping holes. Returns the offset into buffer of end of last decrypted * block. */ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, u64 off, struct ceph_sparse_extent *map, u32 ext_cnt) { struct ceph_client *cl = ceph_inode_to_client(inode); int i, ret = 0; struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; u32 xlen; /* Nothing to do for empty array */ if (ext_cnt == 0) { doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode, ceph_vinop(inode)); return 0; } ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len, &objno, &objoff, &xlen); for (i = 0; i < ext_cnt; ++i) { struct ceph_sparse_extent *ext = &map[i]; int pgsoff = ext->off - objoff; int pgidx = pgsoff >> PAGE_SHIFT; int fret; if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) { pr_warn_client(cl, "%p %llx.%llx bad encrypted sparse extent " "idx %d off %llx len %llx\n", inode, ceph_vinop(inode), i, ext->off, ext->len); return -EIO; } fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx], off + pgsoff, ext->len); doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode, ceph_vinop(inode), i, ext->off, ext->len, fret); if (fret < 0) { if (ret == 0) ret = fret; break; } ret = pgsoff + fret; } doutc(cl, "ret %d\n", ret); return ret; } /** * ceph_fscrypt_encrypt_pages - encrypt an array of pages * @inode: pointer to inode associated with these pages * @page: pointer to page array * @off: offset into the file that the data starts * @len: max length to encrypt * @gfp: gfp flags to use for allocation * * Decrypt an array of cleartext pages and return the amount of * data encrypted. Any data in the page prior to the start of the * first complete block in the read is ignored. Any incomplete * crypto blocks at the end of the array are ignored. * * Returns the length of the encrypted data or a negative errno. */ int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, int len, gfp_t gfp) { int i, num_blocks; u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; int ret = 0; /* * We can't deal with partial blocks on an encrypted file, so mask off * the last bit. */ num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); /* Encrypt each block */ for (i = 0; i < num_blocks; ++i) { int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; int pgidx = blkoff >> PAGE_SHIFT; unsigned int pgoffs = offset_in_page(blkoff); int fret; fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx], CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, baseblk + i, gfp); if (fret < 0) { if (ret == 0) ret = fret; break; } ret += CEPH_FSCRYPT_BLOCK_SIZE; } return ret; }
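The block-walking arithmetic in ceph_fscrypt_decrypt_pages() and ceph_fscrypt_encrypt_pages() above maps a file offset onto a crypto-block index, a page index, and an offset within that page, discarding any trailing partial block. A minimal userspace sketch of just that math follows; the EX_* constants (4 KiB crypto blocks, 16 KiB pages) and the block-aligned starting offset are assumptions chosen for illustration, not the values the kernel actually uses.

#include <stdio.h>

#define EX_FSCRYPT_BLOCK_SHIFT	12			/* assumed: 4 KiB crypto blocks */
#define EX_FSCRYPT_BLOCK_SIZE	(1 << EX_FSCRYPT_BLOCK_SHIFT)
#define EX_FSCRYPT_BLOCK_MASK	(~(EX_FSCRYPT_BLOCK_SIZE - 1))
#define EX_PAGE_SHIFT		14			/* assumed: 16 KiB pages, so in-page offsets vary */
#define EX_PAGE_SIZE		(1 << EX_PAGE_SHIFT)

int main(void)
{
	unsigned long long off = 3ULL * EX_FSCRYPT_BLOCK_SIZE;	/* block-aligned read start (simplification) */
	int len = 5 * EX_FSCRYPT_BLOCK_SIZE + 100;		/* trailing partial block */
	unsigned long long baseblk = off >> EX_FSCRYPT_BLOCK_SHIFT;
	/* mask off the partial trailing block, as the kernel loop does */
	int num_blocks = (len & EX_FSCRYPT_BLOCK_MASK) >> EX_FSCRYPT_BLOCK_SHIFT;
	int i;

	for (i = 0; i < num_blocks; i++) {
		int blkoff = i << EX_FSCRYPT_BLOCK_SHIFT;
		int pgidx = blkoff >> EX_PAGE_SHIFT;			/* index into the page array */
		unsigned int pgoffs = blkoff & (EX_PAGE_SIZE - 1);	/* offset_in_page() equivalent */

		printf("crypto block %llu -> page[%d] offset %u\n",
		       baseblk + i, pgidx, pgoffs);
	}
	return 0;
}

In the kernel the number of blocks comes from ceph_fscrypt_blocks(), which also copes with an unaligned starting offset; the sketch above sidesteps that by assuming the offset is already block-aligned.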
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */

#include <net/inet_dscp.h>
#include <net/ip.h>

#include "ipvlan.h"

static u32 ipvlan_jhash_secret __read_mostly;

void ipvlan_init_secret(void)
{
	net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret));
}

void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len,
		     bool success, bool mcast)
{
	if (likely(success)) {
		struct ipvl_pcpu_stats *pcptr;

		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
u64_stats_update_begin(&pcptr->syncp); u64_stats_inc(&pcptr->rx_pkts); u64_stats_add(&pcptr->rx_bytes, len); if (mcast) u64_stats_inc(&pcptr->rx_mcast); u64_stats_update_end(&pcptr->syncp); } else { this_cpu_inc(ipvlan->pcpu_stats->rx_errs); } } EXPORT_SYMBOL_GPL(ipvlan_count_rx); #if IS_ENABLED(CONFIG_IPV6) static u8 ipvlan_get_v6_hash(const void *iaddr) { const struct in6_addr *ip6_addr = iaddr; return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } #else static u8 ipvlan_get_v6_hash(const void *iaddr) { return 0; } #endif static u8 ipvlan_get_v4_hash(const void *iaddr) { const struct in_addr *ip4_addr = iaddr; return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } static bool addr_equal(bool is_v6, struct ipvl_addr *addr, const void *iaddr) { if (!is_v6 && addr->atype == IPVL_IPV4) { struct in_addr *i4addr = (struct in_addr *)iaddr; return addr->ip4addr.s_addr == i4addr->s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (is_v6 && addr->atype == IPVL_IPV6) { struct in6_addr *i6addr = (struct in6_addr *)iaddr; return ipv6_addr_equal(&addr->ip6addr, i6addr); #endif } return false; } static struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, const void *iaddr, bool is_v6) { struct ipvl_addr *addr; u8 hash; hash = is_v6 ? ipvlan_get_v6_hash(iaddr) : ipvlan_get_v4_hash(iaddr); hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) if (addr_equal(is_v6, addr, iaddr)) return addr; return NULL; } void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr) { struct ipvl_port *port = ipvlan->port; u8 hash; hash = (addr->atype == IPVL_IPV6) ? ipvlan_get_v6_hash(&addr->ip6addr) : ipvlan_get_v4_hash(&addr->ip4addr); if (hlist_unhashed(&addr->hlnode)) hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]); } void ipvlan_ht_addr_del(struct ipvl_addr *addr) { hlist_del_init_rcu(&addr->hlnode); } struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6) { struct ipvl_addr *addr, *ret = NULL; rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) { if (addr_equal(is_v6, addr, iaddr)) { ret = addr; break; } } rcu_read_unlock(); return ret; } bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6) { struct ipvl_dev *ipvlan; bool ret = false; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (ipvlan_find_addr(ipvlan, iaddr, is_v6)) { ret = true; break; } } rcu_read_unlock(); return ret; } void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) { void *lyr3h = NULL; switch (skb->protocol) { case htons(ETH_P_ARP): { struct arphdr *arph; if (unlikely(!pskb_may_pull(skb, arp_hdr_len(port->dev)))) return NULL; arph = arp_hdr(skb); *type = IPVL_ARP; lyr3h = arph; break; } case htons(ETH_P_IP): { u32 pktlen; struct iphdr *ip4h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h)))) return NULL; ip4h = ip_hdr(skb); pktlen = skb_ip_totlen(skb); if (ip4h->ihl < 5 || ip4h->version != 4) return NULL; if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) return NULL; *type = IPVL_IPV4; lyr3h = ip4h; break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct ipv6hdr *ip6h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) return NULL; ip6h = ipv6_hdr(skb); if (ip6h->version != 6) return NULL; *type = IPVL_IPV6; lyr3h = ip6h; /* Only Neighbour Solicitation pkts need different treatment */ if (ipv6_addr_any(&ip6h->saddr) && ip6h->nexthdr == NEXTHDR_ICMP) { struct icmp6hdr *icmph; if (unlikely(!pskb_may_pull(skb, 
sizeof(*ip6h) + sizeof(*icmph)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { /* Need to access the ipv6 address in body */ if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph) + sizeof(struct in6_addr)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); } *type = IPVL_ICMPV6; lyr3h = icmph; } break; } #endif default: return NULL; } return lyr3h; } unsigned int ipvlan_mac_hash(const unsigned char *addr) { u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2), ipvlan_jhash_secret); return hash & IPVLAN_MAC_FILTER_MASK; } void ipvlan_process_multicast(struct work_struct *work) { struct ipvl_port *port = container_of(work, struct ipvl_port, wq); struct ethhdr *ethh; struct ipvl_dev *ipvlan; struct sk_buff *skb, *nskb; struct sk_buff_head list; unsigned int len; unsigned int mac_hash; int ret; u8 pkt_type; bool tx_pkt; __skb_queue_head_init(&list); spin_lock_bh(&port->backlog.lock); skb_queue_splice_tail_init(&port->backlog, &list); spin_unlock_bh(&port->backlog.lock); while ((skb = __skb_dequeue(&list)) != NULL) { struct net_device *dev = skb->dev; bool consumed = false; ethh = eth_hdr(skb); tx_pkt = IPVL_SKB_CB(skb)->tx_pkt; mac_hash = ipvlan_mac_hash(ethh->h_dest); if (ether_addr_equal(ethh->h_dest, port->dev->broadcast)) pkt_type = PACKET_BROADCAST; else pkt_type = PACKET_MULTICAST; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (tx_pkt && (ipvlan->dev == skb->dev)) continue; if (!test_bit(mac_hash, ipvlan->mac_filters)) continue; if (!(ipvlan->dev->flags & IFF_UP)) continue; ret = NET_RX_DROP; len = skb->len + ETH_HLEN; nskb = skb_clone(skb, GFP_ATOMIC); local_bh_disable(); if (nskb) { consumed = true; nskb->pkt_type = pkt_type; nskb->dev = ipvlan->dev; if (tx_pkt) ret = dev_forward_skb(ipvlan->dev, nskb); else ret = netif_rx(nskb); } ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true); local_bh_enable(); } rcu_read_unlock(); if (tx_pkt) { /* If the packet originated here, send it out. */ skb->dev = port->dev; skb->pkt_type = pkt_type; dev_queue_xmit(skb); } else { if (consumed) consume_skb(skb); else kfree_skb(skb); } dev_put(dev); cond_resched(); } } static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) { bool xnet = true; if (dev) xnet = !net_eq(dev_net(skb->dev), dev_net(dev)); skb_scrub_packet(skb, xnet); if (dev) skb->dev = dev; } static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, bool local) { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; unsigned int len; rx_handler_result_t ret = RX_HANDLER_CONSUMED; bool success = false; struct sk_buff *skb = *pskb; len = skb->len + ETH_HLEN; /* Only packets exchanged between two local slaves need to have * device-up check as well as skb-share check. 
*/ if (local) { if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto out; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; *pskb = skb; } if (local) { skb->pkt_type = PACKET_HOST; if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { skb->dev = dev; ret = RX_HANDLER_ANOTHER; success = true; } out: ipvlan_count_rx(ipvlan, len, success, false); return ret; } struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest) { struct ipvl_addr *addr = NULL; switch (addr_type) { #if IS_ENABLED(CONFIG_IPV6) case IPVL_IPV6: { struct ipv6hdr *ip6h; struct in6_addr *i6addr; ip6h = (struct ipv6hdr *)lyr3h; i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr; addr = ipvlan_ht_addr_lookup(port, i6addr, true); break; } case IPVL_ICMPV6: { struct nd_msg *ndmh; struct in6_addr *i6addr; /* Make sure that the NeighborSolicitation ICMPv6 packets * are handled to avoid DAD issue. */ ndmh = (struct nd_msg *)lyr3h; if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { i6addr = &ndmh->target; addr = ipvlan_ht_addr_lookup(port, i6addr, true); } break; } #endif case IPVL_IPV4: { struct iphdr *ip4h; __be32 *i4addr; ip4h = (struct iphdr *)lyr3h; i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr; addr = ipvlan_ht_addr_lookup(port, i4addr, false); break; } case IPVL_ARP: { struct arphdr *arph; unsigned char *arp_ptr; __be32 dip; arph = (struct arphdr *)lyr3h; arp_ptr = (unsigned char *)(arph + 1); if (use_dest) arp_ptr += (2 * port->dev->addr_len) + 4; else arp_ptr += port->dev->addr_len; memcpy(&dip, arp_ptr, 4); addr = ipvlan_ht_addr_lookup(port, &dip, false); break; } } return addr; } static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct rtable *rt; int err, ret = NET_XMIT_DROP; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto err; } skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); err = ip_local_out(net, NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out; err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static noinline_for_stack int ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; struct dst_entry *dst; int err; dst = ip6_route_output(dev_net(dev), NULL, &fl6); err = dst->error; if (err) { dst_release(dst); return err; } skb_dst_set(skb, dst); return 0; } static int ipvlan_process_v6_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; int err, ret = NET_XMIT_DROP; err = ipvlan_route_v6_outbound(dev, skb); if (unlikely(err)) { DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return err; } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); err = ip6_local_out(dev_net(dev), NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = 
NET_XMIT_SUCCESS; return ret; } #else static int ipvlan_process_v6_outbound(struct sk_buff *skb) { return NET_XMIT_DROP; } #endif static int ipvlan_process_outbound(struct sk_buff *skb) { int ret = NET_XMIT_DROP; /* The ipvlan is a pseudo-L2 device, so the packets that we receive * will have L2; which need to discarded and processed further * in the net-ns of the main-device. */ if (skb_mac_header_was_set(skb)) { /* In this mode we dont care about * multicast and broadcast traffic */ struct ethhdr *ethh = eth_hdr(skb); if (is_multicast_ether_addr(ethh->h_dest)) { pr_debug_ratelimited( "Dropped {multi|broad}cast of type=[%x]\n", ntohs(skb->protocol)); kfree_skb(skb); goto out; } skb_pull(skb, sizeof(*ethh)); skb->mac_header = (typeof(skb->mac_header))~0U; skb_reset_network_header(skb); } if (skb->protocol == htons(ETH_P_IPV6)) ret = ipvlan_process_v6_outbound(skb); else if (skb->protocol == htons(ETH_P_IP)) ret = ipvlan_process_v4_outbound(skb); else { pr_warn_ratelimited("Dropped outbound packet type=%x\n", ntohs(skb->protocol)); kfree_skb(skb); } out: return ret; } static void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt) { if (skb->protocol == htons(ETH_P_PAUSE)) { kfree_skb(skb); return; } /* Record that the deferred packet is from TX or RX path. By * looking at mac-addresses on packet will lead to erronus decisions. * (This would be true for a loopback-mode on master device or a * hair-pin mode of the switch.) */ IPVL_SKB_CB(skb)->tx_pkt = tx_pkt; spin_lock(&port->backlog.lock); if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) { dev_hold(skb->dev); __skb_queue_tail(&port->backlog, skb); spin_unlock(&port->backlog.lock); schedule_work(&port->wq); } else { spin_unlock(&port->backlog.lock); dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); } } static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); void *lyr3h; struct ipvl_addr *addr; int addr_type; lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (!lyr3h) goto out; if (!ipvlan_is_vepa(ipvlan->port)) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); return ipvlan_process_outbound(skb); } static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); struct ethhdr *eth = skb_eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (!ipvlan_is_vepa(ipvlan->port) && ether_addr_equal(eth->h_dest, eth->h_source)) { lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Packet definitely does not belong to any of the * virtual devices, but the dest is local. So forward * the skb for the main-dev. At the RX side we just return * RX_PASS for it to be processed further on the stack. 
*/ dev_forward_skb(ipvlan->phy_dev, skb); return NET_XMIT_SUCCESS; } else if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); ipvlan_skb_crossing_ns(skb, NULL); ipvlan_multicast_enqueue(ipvlan->port, skb, true); return NET_XMIT_SUCCESS; } skb->dev = ipvlan->phy_dev; return dev_queue_xmit(skb); } int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev); if (!port) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) goto out; switch(port->mode) { case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: #endif return ipvlan_xmit_mode_l3(skb, dev); } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); out: kfree_skb(skb); return NET_XMIT_DROP; } static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) { struct ethhdr *eth = eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) return true; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); if (addr) return false; } return true; } static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, struct ipvl_port *port) { void *lyr3h; int addr_type; struct ipvl_addr *addr; struct sk_buff *skb = *pskb; rx_handler_result_t ret = RX_HANDLER_PASS; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); if (addr) ret = ipvlan_rcv_frame(addr, pskb, false); out: return ret; } static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ipvl_port *port) { struct sk_buff *skb = *pskb; struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; if (is_multicast_ether_addr(eth->h_dest)) { if (ipvlan_external_frame(skb, port)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later * when work-queue processes this frame. This is * achieved by returning RX_HANDLER_PASS. */ if (nskb) { ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); } } } else { /* Perform like l3 mode for non-multicast packet */ ret = ipvlan_handle_mode_l3(pskb, port); } return ret; } rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); if (!port) return RX_HANDLER_PASS; switch (port->mode) { case IPVLAN_MODE_L2: return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: return RX_HANDLER_PASS; #endif } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); kfree_skb(skb); return RX_HANDLER_CONSUMED; }
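Before ipvlan_get_L3_hdr() above will treat a frame as IPv4, it verifies that the header-length field is at least five 32-bit words, the version nibble is 4, and the total-length field neither exceeds the data actually present nor undercuts the header itself. The standalone check below mirrors that logic over a raw byte buffer; ex_ipv4_sane() and its hard-coded 20-byte minimum are illustrative stand-ins, not the kernel's pskb_may_pull()/skb_ip_totlen() helpers.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the ETH_P_IP sanity checks performed in ipvlan_get_L3_hdr(). */
static bool ex_ipv4_sane(const uint8_t *buf, size_t buflen)
{
	unsigned int version, ihl, pktlen;

	if (buflen < 20)	/* minimum IPv4 header; stands in for pskb_may_pull() */
		return false;

	version = buf[0] >> 4;
	ihl     = buf[0] & 0x0f;
	pktlen  = ((unsigned int)buf[2] << 8) | buf[3];	/* ntohs(tot_len) */

	if (ihl < 5 || version != 4)			/* malformed header */
		return false;
	if (buflen < pktlen || pktlen < ihl * 4)	/* truncated or inconsistent length */
		return false;

	return true;
}

int main(void)
{
	/* 20-byte IPv4 header: version 4, ihl 5, tot_len 20, remaining fields zeroed */
	uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x14 };

	printf("sane: %d\n", ex_ipv4_sane(hdr, sizeof(hdr)));
	return 0;
}

A frame that passes these checks is classified as IPVL_IPV4 and its header pointer is handed on to the address lookup.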
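ipvlan_process_multicast() above decides whether a slave should see a multicast frame by hashing the destination MAC into a small per-slave bitmap (ipvlan_mac_hash() plus test_bit() on mac_filters). The sketch below reproduces that membership test with a single 64-bit word and a deliberately simple stand-in hash; the kernel's jhash_1word() with its random secret, IPVLAN_MAC_FILTER_MASK, and the real bitmap size are not reproduced here.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_MAC_FILTER_BITS 64	/* assumed bitmap size for the example */

/* Toy stand-in for ipvlan_mac_hash(): fold the MAC into a bucket index. */
static unsigned int ex_mac_hash(const uint8_t mac[6])
{
	uint32_t h = 0;
	int i;

	for (i = 0; i < 6; i++)
		h = h * 31 + mac[i];
	return h % EX_MAC_FILTER_BITS;
}

/* set_bit()/test_bit() equivalents over a single 64-bit word */
static void ex_filter_add(uint64_t *filter, const uint8_t mac[6])
{
	*filter |= 1ULL << ex_mac_hash(mac);
}

static bool ex_filter_maybe_member(uint64_t filter, const uint8_t mac[6])
{
	return filter & (1ULL << ex_mac_hash(mac));
}

int main(void)
{
	uint64_t filter = 0;
	uint8_t mcast[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };	/* example group MAC */

	ex_filter_add(&filter, mcast);
	/* A set bit means "possibly subscribed": hash collisions can produce
	 * false positives, which are harmless, while false negatives cannot occur. */
	printf("maybe member: %d\n", ex_filter_maybe_member(filter, mcast));
	return 0;
}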
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) B.A.T.M.A.N. contributors:
 *
 * Martin Hundebøll, Jeppe Ledet-Pedersen
 */

#include "network-coding.h"
#include "main.h"

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <uapi/linux/batadv_packet.h>

#include "hash.h"
#include "log.h"
#include "originator.h"
#include "routing.h"
#include "send.h"
#include "tvlv.h"

static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;

static void batadv_nc_worker(struct work_struct *work);
static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
				       struct batadv_hard_iface *recv_if);

/**
 * batadv_nc_init() - one-time initialization for network coding
 *
 * Return: 0 on success or negative error number in case of failure
 */
int __init batadv_nc_init(void)
{
	/* Register our packet type */
	return batadv_recv_handler_register(BATADV_CODED,
					    batadv_nc_recv_coded_packet);
}

/**
 * batadv_nc_start_timer() - initialise the nc periodic worker
 * @bat_priv: the bat priv with all the soft interface information
 */
static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
{
	queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work,
			   msecs_to_jiffies(10));
}

/**
 * batadv_nc_tvlv_container_update() - update the network coding tvlv container
 *  after network coding setting change
 * @bat_priv: the bat priv with all the soft interface information
 */
static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv)
{
	char nc_mode;

	nc_mode = atomic_read(&bat_priv->network_coding);

	switch (nc_mode) {
	case 0:
		batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
		break;
	case 1:
batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1, NULL, 0); break; } } /** * batadv_nc_status_update() - update the network coding tvlv container after * network coding setting change * @net_dev: the soft interface net device */ void batadv_nc_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); batadv_nc_tvlv_container_update(bat_priv); } /** * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); else set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); } /** * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure */ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) { bat_priv->nc.timestamp_fwd_flush = jiffies; bat_priv->nc.timestamp_sniffed_purge = jiffies; if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash) return 0; bat_priv->nc.coding_hash = batadv_hash_new(128); if (!bat_priv->nc.coding_hash) goto err; batadv_hash_set_lock_class(bat_priv->nc.coding_hash, &batadv_nc_coding_hash_lock_class_key); bat_priv->nc.decoding_hash = batadv_hash_new(128); if (!bat_priv->nc.decoding_hash) { batadv_hash_destroy(bat_priv->nc.coding_hash); goto err; } batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, &batadv_nc_decoding_hash_lock_class_key); INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker); batadv_nc_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, NULL, NULL, BATADV_TVLV_NC, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_nc_tvlv_container_update(bat_priv); return 0; err: return -ENOMEM; } /** * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables * @bat_priv: the bat priv with all the soft interface information */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { atomic_set(&bat_priv->network_coding, 0); bat_priv->nc.min_tq = 200; bat_priv->nc.max_fwd_delay = 10; bat_priv->nc.max_buffer_time = 200; } /** * batadv_nc_init_orig() - initialise the nc fields of an orig_node * @orig_node: the orig_node which is going to be initialised */ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) { INIT_LIST_HEAD(&orig_node->in_coding_list); INIT_LIST_HEAD(&orig_node->out_coding_list); spin_lock_init(&orig_node->in_coding_list_lock); spin_lock_init(&orig_node->out_coding_list_lock); } /** * batadv_nc_node_release() - release nc_node from lists and queue for free * after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_nc_node_release(struct kref *ref) { struct batadv_nc_node *nc_node; nc_node = container_of(ref, struct batadv_nc_node, refcount); batadv_orig_node_put(nc_node->orig_node); kfree_rcu(nc_node, rcu); } /** * batadv_nc_node_put() - decrement the nc_node refcounter and possibly * release it * @nc_node: nc_node to be free'd */ static void batadv_nc_node_put(struct batadv_nc_node *nc_node) { if (!nc_node) return; 
kref_put(&nc_node->refcount, batadv_nc_node_release); } /** * batadv_nc_path_release() - release nc_path from lists and queue for free * after rcu grace period * @ref: kref pointer of the nc_path */ static void batadv_nc_path_release(struct kref *ref) { struct batadv_nc_path *nc_path; nc_path = container_of(ref, struct batadv_nc_path, refcount); kfree_rcu(nc_path, rcu); } /** * batadv_nc_path_put() - decrement the nc_path refcounter and possibly * release it * @nc_path: nc_path to be free'd */ static void batadv_nc_path_put(struct batadv_nc_path *nc_path) { if (!nc_path) return; kref_put(&nc_path->refcount, batadv_nc_path_release); } /** * batadv_nc_packet_free() - frees nc packet * @nc_packet: the nc packet to free * @dropped: whether the packet is freed because is dropped */ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, bool dropped) { if (dropped) kfree_skb(nc_packet->skb); else consume_skb(nc_packet->skb); batadv_nc_path_put(nc_packet->nc_path); kfree(nc_packet); } /** * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged * @bat_priv: the bat priv with all the soft interface information * @nc_node: the nc node to check * * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, struct batadv_nc_node *nc_node) { if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return true; return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT); } /** * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) { if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return true; /* purge the path when no packets has been added for 10 times the * max_fwd_delay time */ return batadv_has_timed_out(nc_path->last_valid, bat_priv->nc.max_fwd_delay * 10); } /** * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed * out * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) { if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return true; /* purge the path when no packets has been added for 10 times the * max_buffer time */ return batadv_has_timed_out(nc_path->last_valid, bat_priv->nc.max_buffer_time * 10); } /** * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale * entries * @bat_priv: the bat priv with all the soft interface information * @list: list of nc nodes * @lock: nc node list lock * @to_purge: function in charge to decide whether an entry has to be purged or * not. 
This function takes the nc node as argument and has to return * a boolean value: true if the entry has to be deleted, false * otherwise */ static void batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, struct list_head *list, spinlock_t *lock, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_node *)) { struct batadv_nc_node *nc_node, *nc_node_tmp; /* For each nc_node in list */ spin_lock_bh(lock); list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) { /* if an helper function has been passed as parameter, * ask it if the entry has to be purged or not */ if (to_purge && !to_purge(bat_priv, nc_node)) continue; batadv_dbg(BATADV_DBG_NC, bat_priv, "Removing nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); list_del_rcu(&nc_node->list); batadv_nc_node_put(nc_node); } spin_unlock_bh(lock); } /** * batadv_nc_purge_orig() - purges all nc node data attached of the given * originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig_node with the nc node entries to be purged * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the nc node as argument and has to return * a boolean value: true is the entry has to be deleted, false * otherwise */ void batadv_nc_purge_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_node *)) { /* Check ingoing nc_node's of this orig_node */ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list, &orig_node->in_coding_list_lock, to_purge); /* Check outgoing nc_node's of this orig_node */ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list, &orig_node->out_coding_list_lock, to_purge); } /** * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if * they have timed out nc nodes * @bat_priv: the bat priv with all the soft interface information */ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; u32 i; if (!hash) return; /* For each orig_node */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) batadv_nc_purge_orig(bat_priv, orig_node, batadv_nc_to_purge_nc_node); rcu_read_unlock(); } } /** * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove * unused ones * @bat_priv: the bat priv with all the soft interface information * @hash: hash table containing the nc paths to check * @to_purge: function in charge to decide whether an entry has to be purged or * not. 
This function takes the nc node as argument and has to return * a boolean value: true is the entry has to be deleted, false * otherwise */ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_path *)) { struct hlist_head *head; struct hlist_node *node_tmp; struct batadv_nc_path *nc_path; spinlock_t *lock; /* Protects lists in hash */ u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; lock = &hash->list_locks[i]; /* For each nc_path in this bin */ spin_lock_bh(lock); hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) { /* if an helper function has been passed as parameter, * ask it if the entry has to be purged or not */ if (to_purge && !to_purge(bat_priv, nc_path)) continue; /* purging an non-empty nc_path should never happen, but * is observed under high CPU load. Delay the purging * until next iteration to allow the packet_list to be * emptied first. */ if (!unlikely(list_empty(&nc_path->packet_list))) { net_ratelimited_function(printk, KERN_WARNING "Skipping free of non-empty nc_path (%pM -> %pM)!\n", nc_path->prev_hop, nc_path->next_hop); continue; } /* nc_path is unused, so remove it */ batadv_dbg(BATADV_DBG_NC, bat_priv, "Remove nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); hlist_del_rcu(&nc_path->hash_entry); batadv_nc_path_put(nc_path); } spin_unlock_bh(lock); } } /** * batadv_nc_hash_key_gen() - computes the nc_path hash key * @key: buffer to hold the final hash key * @src: source ethernet mac address going into the hash key * @dst: destination ethernet mac address going into the hash key */ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, const char *dst) { memcpy(key->prev_hop, src, sizeof(key->prev_hop)); memcpy(key->next_hop, dst, sizeof(key->next_hop)); } /** * batadv_nc_hash_choose() - compute the hash value for an nc path * @data: data to hash * @size: size of the hash table * * Return: the selected index in the hash table for the given data. */ static u32 batadv_nc_hash_choose(const void *data, u32 size) { const struct batadv_nc_path *nc_path = data; u32 hash = 0; hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); return hash % size; } /** * batadv_nc_hash_compare() - comparing function used in the network coding hash * tables * @node: node in the local table * @data2: second object to compare the node to * * Return: true if the two entry are the same, false otherwise */ static bool batadv_nc_hash_compare(const struct hlist_node *node, const void *data2) { const struct batadv_nc_path *nc_path1, *nc_path2; nc_path1 = container_of(node, struct batadv_nc_path, hash_entry); nc_path2 = data2; /* Return 1 if the two keys are identical */ if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) return false; if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) return false; return true; } /** * batadv_nc_hash_find() - search for an existing nc path and return it * @hash: hash table containing the nc path * @data: search key * * Return: the nc_path if found, NULL otherwise. 
*/ static struct batadv_nc_path * batadv_nc_hash_find(struct batadv_hashtable *hash, void *data) { struct hlist_head *head; struct batadv_nc_path *nc_path, *nc_path_tmp = NULL; int index; if (!hash) return NULL; index = batadv_nc_hash_choose(data, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, head, hash_entry) { if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) continue; if (!kref_get_unless_zero(&nc_path->refcount)) continue; nc_path_tmp = nc_path; break; } rcu_read_unlock(); return nc_path_tmp; } /** * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct * @nc_packet: the nc packet to send */ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) { batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); nc_packet->skb = NULL; batadv_nc_packet_free(nc_packet, false); } /** * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked * * Checks whether the given sniffed (overheard) nc_packet has hit its buffering * timeout. If so, the packet is no longer kept and the entry deleted from the * queue. Has to be called with the appropriate locks. * * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path, struct batadv_nc_packet *nc_packet) { unsigned long timeout = bat_priv->nc.max_buffer_time; bool res = false; lockdep_assert_held(&nc_path->packet_list_lock); /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && !batadv_has_timed_out(nc_packet->timestamp, timeout)) goto out; /* purge nc packet */ list_del(&nc_packet->list); batadv_nc_packet_free(nc_packet, true); res = true; out: return res; } /** * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked * * Checks whether the given nc packet has hit its forward timeout. If so, the * packet is no longer delayed, immediately sent and the entry deleted from the * queue. Has to be called with the appropriate locks. * * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. 
*/ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path, struct batadv_nc_packet *nc_packet) { unsigned long timeout = bat_priv->nc.max_fwd_delay; lockdep_assert_held(&nc_path->packet_list_lock); /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && !batadv_has_timed_out(nc_packet->timestamp, timeout)) return false; /* Send packet */ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, nc_packet->skb->len + ETH_HLEN); list_del(&nc_packet->list); batadv_nc_send_packet(nc_packet); return true; } /** * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed * out nc packets * @bat_priv: the bat priv with all the soft interface information * @hash: to be processed hash table * @process_fn: Function called to process given nc packet. Should return true * to encourage this function to proceed with the next packet. * Otherwise the rest of the current queue is skipped. */ static void batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, bool (*process_fn)(struct batadv_priv *, struct batadv_nc_path *, struct batadv_nc_packet *)) { struct hlist_head *head; struct batadv_nc_packet *nc_packet, *nc_packet_tmp; struct batadv_nc_path *nc_path; bool ret; int i; if (!hash) return; /* Loop hash table bins */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; /* Loop coding paths */ rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, head, hash_entry) { /* Loop packets */ spin_lock_bh(&nc_path->packet_list_lock); list_for_each_entry_safe(nc_packet, nc_packet_tmp, &nc_path->packet_list, list) { ret = process_fn(bat_priv, nc_path, nc_packet); if (!ret) break; } spin_unlock_bh(&nc_path->packet_list_lock); } rcu_read_unlock(); } } /** * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ static void batadv_nc_worker(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_nc *priv_nc; struct batadv_priv *bat_priv; unsigned long timeout; delayed_work = to_delayed_work(work); priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); bat_priv = container_of(priv_nc, struct batadv_priv, nc); batadv_nc_purge_orig_hash(bat_priv); batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, batadv_nc_to_purge_nc_path_coding); batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, batadv_nc_to_purge_nc_path_decoding); timeout = bat_priv->nc.max_fwd_delay; if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) { batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash, batadv_nc_fwd_flush); bat_priv->nc.timestamp_fwd_flush = jiffies; } if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge, bat_priv->nc.max_buffer_time)) { batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash, batadv_nc_sniffed_purge); bat_priv->nc.timestamp_sniffed_purge = jiffies; } /* Schedule a new check */ batadv_nc_start_timer(bat_priv); } /** * batadv_can_nc_with_orig() - checks whether the given orig node is suitable * for coding or not * @bat_priv: the bat priv with all the soft interface information * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks * * Return: true if: * 1) The OGM must have the most recent sequence number. 
* 2) The TTL must be decremented by one and only one. * 3) The OGM must be received from the first hop from orig_node. * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq. */ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_ogm_packet *ogm_packet) { struct batadv_orig_ifinfo *orig_ifinfo; u32 last_real_seqno; u8 last_ttl; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); if (!orig_ifinfo) return false; last_ttl = orig_ifinfo->last_ttl; last_real_seqno = orig_ifinfo->last_real_seqno; batadv_orig_ifinfo_put(orig_ifinfo); if (last_real_seqno != ntohl(ogm_packet->seqno)) return false; if (last_ttl != ogm_packet->ttl + 1) return false; if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) return false; if (ogm_packet->tq < bat_priv->nc.min_tq) return false; return true; } /** * batadv_nc_find_nc_node() - search for an existing nc node and return it * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * * Return: the nc_node if found, NULL otherwise. */ static struct batadv_nc_node * batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, bool in_coding) { struct batadv_nc_node *nc_node, *nc_node_out = NULL; struct list_head *list; if (in_coding) list = &orig_neigh_node->in_coding_list; else list = &orig_neigh_node->out_coding_list; /* Traverse list of nc_nodes to orig_node */ rcu_read_lock(); list_for_each_entry_rcu(nc_node, list, list) { if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) continue; if (!kref_get_unless_zero(&nc_node->refcount)) continue; /* Found a match */ nc_node_out = nc_node; break; } rcu_read_unlock(); return nc_node_out; } /** * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was * not found * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * * Return: the nc_node if found or created, NULL in case of an error. 
*/ static struct batadv_nc_node * batadv_nc_get_nc_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, bool in_coding) { struct batadv_nc_node *nc_node; spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ struct list_head *list; /* Select ingoing or outgoing coding node */ if (in_coding) { lock = &orig_neigh_node->in_coding_list_lock; list = &orig_neigh_node->in_coding_list; } else { lock = &orig_neigh_node->out_coding_list_lock; list = &orig_neigh_node->out_coding_list; } spin_lock_bh(lock); /* Check if nc_node is already added */ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); /* Node found */ if (nc_node) goto unlock; nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); if (!nc_node) goto unlock; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); kref_init(&nc_node->refcount); ether_addr_copy(nc_node->addr, orig_node->orig); kref_get(&orig_neigh_node->refcount); nc_node->orig_node = orig_neigh_node; batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); /* Add nc_node to orig_node */ kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); unlock: spin_unlock_bh(lock); return nc_node; } /** * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node * structs (best called on incoming OGMs) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) * @ogm_packet: incoming ogm packet * @is_single_hop_neigh: orig_node is a single hop neighbor */ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *ogm_packet, int is_single_hop_neigh) { struct batadv_nc_node *in_nc_node = NULL; struct batadv_nc_node *out_nc_node = NULL; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* check if orig node is network coding enabled */ if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) goto out; /* accept ogms from 'good' neighbors and single hop neighbors */ if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) && !is_single_hop_neigh) goto out; /* Add orig_node as in_nc_node on hop */ in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node, orig_neigh_node, true); if (!in_nc_node) goto out; in_nc_node->last_seen = jiffies; /* Add hop as out_nc_node on orig_node */ out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node, orig_node, false); if (!out_nc_node) goto out; out_nc_node->last_seen = jiffies; out: batadv_nc_node_put(in_nc_node); batadv_nc_node_put(out_nc_node); } /** * batadv_nc_get_path() - get existing nc_path or allocate a new one * @bat_priv: the bat priv with all the soft interface information * @hash: hash table containing the nc path * @src: ethernet source address - first half of the nc path search key * @dst: ethernet destination address - second half of the nc path search key * * Return: pointer to nc_path if the path was found or created, returns NULL * on error. 
*/ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, u8 *src, u8 *dst) { int hash_added; struct batadv_nc_path *nc_path, nc_path_key; batadv_nc_hash_key_gen(&nc_path_key, src, dst); /* Search for existing nc_path */ nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key); if (nc_path) { /* Set timestamp to delay removal of nc_path */ nc_path->last_valid = jiffies; return nc_path; } /* No existing nc_path was found; create a new */ nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC); if (!nc_path) return NULL; /* Initialize nc_path */ INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); kref_init(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); /* Add nc_path to hash table */ kref_get(&nc_path->refcount); hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, batadv_nc_hash_choose, &nc_path_key, &nc_path->hash_entry); if (hash_added < 0) { kfree(nc_path); return NULL; } return nc_path; } /** * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value * * Return: scaled tq value */ static u8 batadv_nc_random_weight_tq(u8 tq) { /* randomize the estimated packet loss (max TQ - estimated TQ) */ u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq); /* convert to (randomized) estimated tq again */ return BATADV_TQ_MAX_VALUE - rand_tq; } /** * batadv_nc_memxor() - XOR destination with source * @dst: byte array to XOR into * @src: byte array to XOR from * @len: length of destination array */ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) { unsigned int i; for (i = 0; i < len; ++i) dst[i] ^= src[i]; } /** * batadv_nc_code_packets() - code a received unicast_packet with an nc packet * into a coded_packet and send it * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward * @ethhdr: pointer to the ethernet header inside the skb * @nc_packet: structure containing the packet to the skb can be coded with * @neigh_node: next hop to forward packet to * * Return: true if both packets are consumed, false otherwise. */ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct sk_buff *skb, struct ethhdr *ethhdr, struct batadv_nc_packet *nc_packet, struct batadv_neigh_node *neigh_node) { u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; struct sk_buff *skb_dest, *skb_src; struct batadv_unicast_packet *packet1; struct batadv_unicast_packet *packet2; struct batadv_coded_packet *coded_packet; struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; struct batadv_neigh_node *router_coding = NULL, *second_dest; struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; u8 *first_source, *second_source; __be32 packet_id1, packet_id2; size_t count; bool res = false; int coding_len; int unicast_size = sizeof(*packet1); int coded_size = sizeof(*coded_packet); int header_add = coded_size - unicast_size; /* TODO: do we need to consider the outgoing interface for * coded packets? 
*/ router_neigh = batadv_orig_router_get(neigh_node->orig_node, BATADV_IF_DEFAULT); if (!router_neigh) goto out; router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh, BATADV_IF_DEFAULT); if (!router_neigh_ifinfo) goto out; neigh_tmp = nc_packet->neigh_node; router_coding = batadv_orig_router_get(neigh_tmp->orig_node, BATADV_IF_DEFAULT); if (!router_coding) goto out; router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding, BATADV_IF_DEFAULT); if (!router_coding_ifinfo) goto out; tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg; tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp); tq_tmp = router_coding_ifinfo->bat_iv.tq_avg; tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp); /* Select one destination for the MAC-header dst-field based on * weighted TQ-values. */ if (tq_weighted_neigh >= tq_weighted_coding) { /* Destination from nc_packet is selected for MAC-header */ first_dest = nc_packet->neigh_node; first_source = nc_packet->nc_path->prev_hop; second_dest = neigh_node; second_source = ethhdr->h_source; packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; packet2 = (struct batadv_unicast_packet *)skb->data; packet_id1 = nc_packet->packet_id; packet_id2 = batadv_skb_crc32(skb, skb->data + sizeof(*packet2)); } else { /* Destination for skb is selected for MAC-header */ first_dest = neigh_node; first_source = ethhdr->h_source; second_dest = nc_packet->neigh_node; second_source = nc_packet->nc_path->prev_hop; packet1 = (struct batadv_unicast_packet *)skb->data; packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; packet_id1 = batadv_skb_crc32(skb, skb->data + sizeof(*packet1)); packet_id2 = nc_packet->packet_id; } /* Instead of zero padding the smallest data buffer, we * code into the largest. */ if (skb->len <= nc_packet->skb->len) { skb_dest = nc_packet->skb; skb_src = skb; } else { skb_dest = skb; skb_src = nc_packet->skb; } /* coding_len is used when decoding the packet shorter packet */ coding_len = skb_src->len - unicast_size; if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0) goto out; skb_push(skb_dest, header_add); coded_packet = (struct batadv_coded_packet *)skb_dest->data; skb_reset_mac_header(skb_dest); coded_packet->packet_type = BATADV_CODED; coded_packet->version = BATADV_COMPAT_VERSION; coded_packet->ttl = packet1->ttl; /* Info about first unicast packet */ ether_addr_copy(coded_packet->first_source, first_source); ether_addr_copy(coded_packet->first_orig_dest, packet1->dest); coded_packet->first_crc = packet_id1; coded_packet->first_ttvn = packet1->ttvn; /* Info about second unicast packet */ ether_addr_copy(coded_packet->second_dest, second_dest->addr); ether_addr_copy(coded_packet->second_source, second_source); ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); coded_packet->second_crc = packet_id2; coded_packet->second_ttl = packet2->ttl; coded_packet->second_ttvn = packet2->ttvn; coded_packet->coded_len = htons(coding_len); /* This is where the magic happens: Code skb_src into skb_dest */ batadv_nc_memxor(skb_dest->data + coded_size, skb_src->data + unicast_size, coding_len); /* Update counters accordingly */ if (BATADV_SKB_CB(skb_src)->decoded && BATADV_SKB_CB(skb_dest)->decoded) { /* Both packets are recoded */ count = skb_src->len + ETH_HLEN; count += skb_dest->len + ETH_HLEN; batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2); batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count); } else if (!BATADV_SKB_CB(skb_src)->decoded && !BATADV_SKB_CB(skb_dest)->decoded) { /* Both packets are newly 
coded */ count = skb_src->len + ETH_HLEN; count += skb_dest->len + ETH_HLEN; batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2); batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count); } else if (BATADV_SKB_CB(skb_src)->decoded && !BATADV_SKB_CB(skb_dest)->decoded) { /* skb_src recoded and skb_dest is newly coded */ batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, skb_src->len + ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, skb_dest->len + ETH_HLEN); } else if (!BATADV_SKB_CB(skb_src)->decoded && BATADV_SKB_CB(skb_dest)->decoded) { /* skb_src is newly coded and skb_dest is recoded */ batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, skb_src->len + ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, skb_dest->len + ETH_HLEN); } /* skb_src is now coded into skb_dest, so free it */ consume_skb(skb_src); /* avoid duplicate free of skb from nc_packet */ nc_packet->skb = NULL; batadv_nc_packet_free(nc_packet, false); /* Send the coded packet and return true */ batadv_send_unicast_skb(skb_dest, first_dest); res = true; out: batadv_neigh_node_put(router_neigh); batadv_neigh_node_put(router_coding); batadv_neigh_ifinfo_put(router_neigh_ifinfo); batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } /** * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst. * @skb: data skb to forward * @dst: destination mac address of the other skb to code with * @src: source mac address of skb * * Whenever we network code a packet we have to check whether we received it in * a network coded form. If so, we may not be able to use it for coding because * some neighbors may also have received (overheard) the packet in the network * coded form without being able to decode it. It is hard to know which of the * neighboring nodes was able to decode the packet, therefore we can only * re-code the packet if the source of the previous encoded packet is involved. * Since the source encoded the packet we can be certain it has all necessary * decode information. * * Return: true if coding of a decoded packet is allowed. */ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) return false; return true; } /** * batadv_nc_path_search() - Find the coding path matching in_nc_node and * out_nc_node to retrieve a buffered packet that can be used for coding. * @bat_priv: the bat priv with all the soft interface information * @in_nc_node: pointer to skb next hop's neighbor nc node * @out_nc_node: pointer to skb source's neighbor nc node * @skb: data skb to forward * @eth_dst: next hop mac address of skb * * Return: an nc packet that can be coded with the skb, NULL if no suitable packet is found.
*/ static struct batadv_nc_packet * batadv_nc_path_search(struct batadv_priv *bat_priv, struct batadv_nc_node *in_nc_node, struct batadv_nc_node *out_nc_node, struct sk_buff *skb, u8 *eth_dst) { struct batadv_nc_path *nc_path, nc_path_key; struct batadv_nc_packet *nc_packet_out = NULL; struct batadv_nc_packet *nc_packet, *nc_packet_tmp; struct batadv_hashtable *hash = bat_priv->nc.coding_hash; int idx; if (!hash) return NULL; /* Create almost path key */ batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr, out_nc_node->addr); idx = batadv_nc_hash_choose(&nc_path_key, hash->size); /* Check for coding opportunities in this nc_path */ rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) { if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr)) continue; if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr)) continue; spin_lock_bh(&nc_path->packet_list_lock); if (list_empty(&nc_path->packet_list)) { spin_unlock_bh(&nc_path->packet_list_lock); continue; } list_for_each_entry_safe(nc_packet, nc_packet_tmp, &nc_path->packet_list, list) { if (!batadv_nc_skb_coding_possible(nc_packet->skb, eth_dst, in_nc_node->addr)) continue; /* Coding opportunity is found! */ list_del(&nc_packet->list); nc_packet_out = nc_packet; break; } spin_unlock_bh(&nc_path->packet_list_lock); break; } rcu_read_unlock(); return nc_packet_out; } /** * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward * @eth_dst: next hop mac address of skb * @eth_src: source mac address of skb * @in_nc_node: pointer to skb next hop's neighbor nc node * * Return: an nc packet if a suitable coding packet was found, NULL otherwise. 
*/ static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *eth_dst, u8 *eth_src, struct batadv_nc_node *in_nc_node) { struct batadv_orig_node *orig_node; struct batadv_nc_node *out_nc_node; struct batadv_nc_packet *nc_packet = NULL; orig_node = batadv_orig_hash_find(bat_priv, eth_src); if (!orig_node) return NULL; rcu_read_lock(); list_for_each_entry_rcu(out_nc_node, &orig_node->out_coding_list, list) { /* Check if the skb is decoded and if recoding is possible */ if (!batadv_nc_skb_coding_possible(skb, out_nc_node->addr, eth_src)) continue; /* Search for an opportunity in this nc_path */ nc_packet = batadv_nc_path_search(bat_priv, in_nc_node, out_nc_node, skb, eth_dst); if (nc_packet) break; } rcu_read_unlock(); batadv_orig_node_put(orig_node); return nc_packet; } /** * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the * unicast skb before it is stored for use in later decoding * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to store * @eth_dst_new: new destination mac address of skb */ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *eth_dst_new) { struct ethhdr *ethhdr; /* Copy skb header to change the mac header */ skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!skb) return; /* Set the mac header as if we actually sent the packet uncoded */ ethhdr = eth_hdr(skb); ether_addr_copy(ethhdr->h_source, ethhdr->h_dest); ether_addr_copy(ethhdr->h_dest, eth_dst_new); /* Set data pointer to MAC header to mimic packets from our tx path */ skb_push(skb, ETH_HLEN); /* Add the packet to the decoding packet pool */ batadv_nc_skb_store_for_decoding(bat_priv, skb); /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free * our ref */ consume_skb(skb); } /** * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst. * @skb: data skb to forward * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * * Loops through the list of neighboring nodes the next hop has a good * connection to (receives OGMs with a sufficient quality). We need to find a * neighbor of our next hop that potentially sent a packet which our next hop * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, struct batadv_neigh_node *neigh_node, struct ethhdr *ethhdr) { struct net_device *netdev = neigh_node->if_incoming->soft_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_orig_node *orig_node = neigh_node->orig_node; struct batadv_nc_node *nc_node; struct batadv_nc_packet *nc_packet = NULL; rcu_read_lock(); list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) { /* Search for coding opportunity with this in_nc_node */ nc_packet = batadv_nc_skb_src_search(bat_priv, skb, neigh_node->addr, ethhdr->h_source, nc_node); /* Opportunity was found, so stop searching */ if (nc_packet) break; } rcu_read_unlock(); if (!nc_packet) return false; /* Save packets for later decoding */ batadv_nc_skb_store_before_coding(bat_priv, skb, neigh_node->addr); batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb, nc_packet->neigh_node->addr); /* Code and send packets */ if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet, neigh_node)) return true; /* out of mem ? 
Coding failed - we have to free the buffered packet * to avoid memleaks. The skb passed as argument will be dealt with * by the calling function. */ batadv_nc_send_packet(nc_packet); return false; } /** * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding * @skb: skb to add to path * @nc_path: path to add skb to * @neigh_node: next hop to forward packet to * @packet_id: checksum to identify packet * * Return: true if the packet was buffered or false in case of an error. */ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, struct batadv_nc_path *nc_path, struct batadv_neigh_node *neigh_node, __be32 packet_id) { struct batadv_nc_packet *nc_packet; nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC); if (!nc_packet) return false; /* Initialize nc_packet */ nc_packet->timestamp = jiffies; nc_packet->packet_id = packet_id; nc_packet->skb = skb; nc_packet->neigh_node = neigh_node; nc_packet->nc_path = nc_path; /* Add coding packet to list */ spin_lock_bh(&nc_path->packet_list_lock); list_add_tail(&nc_packet->list, &nc_path->packet_list); spin_unlock_bh(&nc_path->packet_list_lock); return true; } /** * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet * buffer * @skb: data skb to forward * @neigh_node: next hop to forward packet to * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ bool batadv_nc_skb_forward(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) { const struct net_device *netdev = neigh_node->if_incoming->soft_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; struct ethhdr *ethhdr = eth_hdr(skb); __be32 packet_id; u8 *payload; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* We only handle unicast packets */ payload = skb_network_header(skb); packet = (struct batadv_unicast_packet *)payload; if (packet->packet_type != BATADV_UNICAST) goto out; /* Try to find a coding opportunity and send the skb if one is found */ if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr)) return true; /* Find or create a nc_path for this src-dst pair */ nc_path = batadv_nc_get_path(bat_priv, bat_priv->nc.coding_hash, ethhdr->h_source, neigh_node->addr); if (!nc_path) goto out; /* Add skb to nc_path */ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id)) goto free_nc_path; /* Packet is consumed */ return true; free_nc_path: batadv_nc_path_put(nc_path); out: /* Packet is not consumed */ return false; } /** * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be * used when decoding coded packets * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to store */ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; struct ethhdr *ethhdr = eth_hdr(skb); __be32 packet_id; u8 *payload; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* Check for supported packet type */ payload = skb_network_header(skb); packet = (struct batadv_unicast_packet *)payload; if (packet->packet_type != BATADV_UNICAST) goto out; /* Find existing nc_path or create a new */ nc_path = batadv_nc_get_path(bat_priv, bat_priv->nc.decoding_hash, ethhdr->h_source, ethhdr->h_dest); if (!nc_path) goto out; /* Clone skb and adjust skb->data to 
point at batman header */ skb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb)) goto free_nc_path; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto free_skb; if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN))) goto free_skb; /* Add skb to nc_path */ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id)) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER); return; free_skb: kfree_skb(skb); free_nc_path: batadv_nc_path_put(nc_path); out: return; } /** * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet * should be saved in the decoding buffer and, if so, store it there * @bat_priv: the bat priv with all the soft interface information * @skb: unicast skb to store */ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct ethhdr *ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) return; /* Set data pointer to MAC header to mimic packets from our tx path */ skb_push(skb, ETH_HLEN); batadv_nc_skb_store_for_decoding(bat_priv, skb); } /** * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored * in nc_packet * @bat_priv: the bat priv with all the soft interface information * @skb: unicast skb to decode * @nc_packet: decode data needed to decode the skb * * Return: pointer to decoded unicast packet if the packet was decoded or NULL * in case of an error. */ static struct batadv_unicast_packet * batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_nc_packet *nc_packet) { const int h_size = sizeof(struct batadv_unicast_packet); const int h_diff = sizeof(struct batadv_coded_packet) - h_size; struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet coded_packet_tmp; struct ethhdr *ethhdr, ethhdr_tmp; u8 *orig_dest, ttl, ttvn; unsigned int coding_len; int err; /* Save headers temporarily */ memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp)); if (skb_cow(skb, 0) < 0) return NULL; if (unlikely(!skb_pull_rcsum(skb, h_diff))) return NULL; /* Data points to batman header, so set mac header 14 bytes before * and network to data */ skb_set_mac_header(skb, -ETH_HLEN); skb_reset_network_header(skb); /* Reconstruct original mac header */ ethhdr = eth_hdr(skb); *ethhdr = ethhdr_tmp; /* Select the correct unicast header information based on the location * of our mac address in the coded_packet header */ if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) { /* If we are the second destination the packet was overheard, * so the Ethernet address must be copied to h_dest and * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST */ ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest); skb->pkt_type = PACKET_HOST; orig_dest = coded_packet_tmp.second_orig_dest; ttl = coded_packet_tmp.second_ttl; ttvn = coded_packet_tmp.second_ttvn; } else { orig_dest = coded_packet_tmp.first_orig_dest; ttl = coded_packet_tmp.ttl; ttvn = coded_packet_tmp.first_ttvn; } coding_len = ntohs(coded_packet_tmp.coded_len); if (coding_len > skb->len) return NULL; /* Here the magic is reversed: * extract the missing packet from the received coded packet */ batadv_nc_memxor(skb->data + h_size, nc_packet->skb->data + h_size, coding_len); /* Resize decoded skb if decoded with larger packet */ if (nc_packet->skb->len > coding_len + h_size) { err = pskb_trim_rcsum(skb, coding_len + h_size); if (err) return NULL; } 
/* Create decoded unicast packet */ unicast_packet = (struct batadv_unicast_packet *)skb->data; unicast_packet->packet_type = BATADV_UNICAST; unicast_packet->version = BATADV_COMPAT_VERSION; unicast_packet->ttl = ttl; ether_addr_copy(unicast_packet->dest, orig_dest); unicast_packet->ttvn = ttvn; batadv_nc_packet_free(nc_packet, false); return unicast_packet; } /** * batadv_nc_find_decoding_packet() - search through buffered decoding data to * find the data needed to decode the coded packet * @bat_priv: the bat priv with all the soft interface information * @ethhdr: pointer to the ethernet header inside the coded packet * @coded: coded packet we try to find decode data for * * Return: pointer to nc packet if the needed data was found or NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, struct ethhdr *ethhdr, struct batadv_coded_packet *coded) { struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; struct batadv_nc_path *nc_path, nc_path_key; u8 *dest, *source; __be32 packet_id; int index; if (!hash) return NULL; /* Select the correct packet id based on the location of our mac-addr */ dest = ethhdr->h_source; if (!batadv_is_my_mac(bat_priv, coded->second_dest)) { source = coded->second_source; packet_id = coded->second_crc; } else { source = coded->first_source; packet_id = coded->first_crc; } batadv_nc_hash_key_gen(&nc_path_key, source, dest); index = batadv_nc_hash_choose(&nc_path_key, hash->size); /* Search for matching coding path */ rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) { /* Find matching nc_packet */ spin_lock_bh(&nc_path->packet_list_lock); list_for_each_entry(tmp_nc_packet, &nc_path->packet_list, list) { if (packet_id == tmp_nc_packet->packet_id) { list_del(&tmp_nc_packet->list); nc_packet = tmp_nc_packet; break; } } spin_unlock_bh(&nc_path->packet_list_lock); if (nc_packet) break; } rcu_read_unlock(); if (!nc_packet) batadv_dbg(BATADV_DBG_NC, bat_priv, "No decoding packet found for %u\n", packet_id); return nc_packet; } /** * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the * resulting unicast packet * @skb: incoming coded packet * @recv_if: pointer to interface this packet was received on * * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. 
*/ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet *coded_packet; struct batadv_nc_packet *nc_packet; struct ethhdr *ethhdr; int hdr_size = sizeof(*coded_packet); /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto free_skb; /* Make sure we can access (and remove) header */ if (unlikely(!pskb_may_pull(skb, hdr_size))) goto free_skb; coded_packet = (struct batadv_coded_packet *)skb->data; ethhdr = eth_hdr(skb); /* Verify frame is destined for us */ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) goto free_skb; /* Update stat counter */ if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED); nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr, coded_packet); if (!nc_packet) { batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); goto free_skb; } /* Make skb's linear, because decoding accesses the entire buffer */ if (skb_linearize(skb) < 0) goto free_nc_packet; if (skb_linearize(nc_packet->skb) < 0) goto free_nc_packet; /* Decode the packet */ unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet); if (!unicast_packet) { batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); goto free_nc_packet; } /* Mark packet as decoded to do correct recoding when forwarding */ BATADV_SKB_CB(skb)->decoded = true; batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES, skb->len + ETH_HLEN); return batadv_recv_unicast_packet(skb, recv_if); free_nc_packet: batadv_nc_packet_free(nc_packet, true); free_skb: kfree_skb(skb); return NET_RX_DROP; } /** * batadv_nc_mesh_free() - clean up network coding memory * @bat_priv: the bat priv with all the soft interface information */ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) { batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1); cancel_delayed_work_sync(&bat_priv->nc.work); batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL); batadv_hash_destroy(bat_priv->nc.coding_hash); batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); batadv_hash_destroy(bat_priv->nc.decoding_hash); }
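The coding and decoding paths above both rely on the symmetry of XOR: batadv_nc_code_packets() XORs the shorter unicast payload into the longer one via batadv_nc_memxor(), and batadv_nc_skb_decode_packet() reverses this by XOR-ing the buffered copy back out before trimming the result. A minimal, self-contained sketch of that invariant follows; it uses plain buffers and invented names (demo_memxor, pkt_a, coded), not the kernel API.

/* Illustrative only: mirrors the XOR relation used by batadv_nc_memxor(),
 * batadv_nc_code_packets() and batadv_nc_skb_decode_packet(), with plain
 * buffers instead of skbs. All names here are made up for the demo.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Same operation as batadv_nc_memxor(): dst[i] ^= src[i] for len bytes. */
static void demo_memxor(unsigned char *dst, const unsigned char *src,
			size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char pkt_a[] = "payload-A";          /* shorter packet */
	unsigned char coded[] = "payload-B-longer!";  /* longer packet, coded in place */
	unsigned char pkt_b_copy[sizeof(coded)];
	size_t coding_len = sizeof(pkt_a);            /* plays the role of coded_len */

	/* What a decoding node buffered earlier (its own copy of packet B). */
	memcpy(pkt_b_copy, coded, sizeof(coded));

	/* Coding: XOR the shorter packet into the longer one. */
	demo_memxor(coded, pkt_a, coding_len);

	/* Decoding: XOR the buffered copy back out; bytes beyond coding_len
	 * would be trimmed in the kernel by pskb_trim_rcsum().
	 */
	demo_memxor(coded, pkt_b_copy, coding_len);

	assert(memcmp(coded, pkt_a, coding_len) == 0);
	printf("recovered: %.*s\n", (int)coding_len, (const char *)coded);
	return 0;
}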
/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ufs/util.h * * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics */ #include <linux/buffer_head.h> #include <linux/fs.h> #include "swab.h" /* * functions used for retyping */ static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi) { return &cpi->c_ubh; } static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi) { return &spi->s_ubh; } /* * macros used for accessing structures */ static inline s32 ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3) { switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); case UFS_ST_SUNx86: return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); case UFS_ST_44BSD: default: return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state); } } static inline void ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3, s32 value) {
switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) { usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); break; } fallthrough; /* to UFS_ST_SUN */ case UFS_ST_SUN: usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); break; case UFS_ST_SUNx86: usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); break; case UFS_ST_44BSD: usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value); break; } } static inline u32 ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3) { if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect); else return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); } static inline u64 ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3) { __fs64 tmp; switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: case UFS_ST_SUN: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1]; break; case UFS_ST_SUNx86: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1]; break; case UFS_ST_44BSD: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1]; break; } return fs64_to_cpu(sb, tmp); } static inline u64 ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3) { __fs64 tmp; switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNOS: case UFS_ST_SUN: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1]; break; case UFS_ST_SUNx86: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1]; break; case UFS_ST_44BSD: ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0]; ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1]; break; } return fs64_to_cpu(sb, tmp); } static inline u16 ufs_get_de_namlen(struct super_block *sb, struct ufs_dir_entry *de) { if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) return fs16_to_cpu(sb, de->d_u.d_namlen); else return de->d_u.d_44.d_namlen; /* XXX this seems wrong */ } static inline void ufs_set_de_namlen(struct super_block *sb, struct ufs_dir_entry *de, u16 value) { if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) de->d_u.d_namlen = cpu_to_fs16(sb, value); else de->d_u.d_44.d_namlen = value; /* XXX this seems wrong */ } static inline void ufs_set_de_type(struct super_block *sb, struct ufs_dir_entry *de, int mode) { if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) != UFS_DE_44BSD) return; /* * TODO turn this into a table lookup */ switch (mode & S_IFMT) { case S_IFSOCK: de->d_u.d_44.d_type = DT_SOCK; break; case S_IFLNK: de->d_u.d_44.d_type = DT_LNK; break; case S_IFREG: de->d_u.d_44.d_type = DT_REG; break; case S_IFBLK: de->d_u.d_44.d_type = DT_BLK; break; case S_IFDIR: de->d_u.d_44.d_type = DT_DIR; break; case S_IFCHR: de->d_u.d_44.d_type = DT_CHR; break; case S_IFIFO: de->d_u.d_44.d_type = DT_FIFO; break; default: de->d_u.d_44.d_type = DT_UNKNOWN; } } static inline u32 ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_uid); case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_suid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid); fallthrough; default: return fs16_to_cpu(sb, 
inode->ui_u1.oldids.ui_suid); } } static inline void ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: inode->ui_u3.ui_44.ui_uid = cpu_to_fs32(sb, value); inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); break; case UFS_UID_EFT: inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; fallthrough; default: inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); break; } } static inline u32 ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); fallthrough; default: return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); } } static inline void ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) { switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { case UFS_UID_44BSD: inode->ui_u3.ui_44.ui_gid = cpu_to_fs32(sb, value); inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); break; case UFS_UID_EFT: inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); if (value > 0xFFFF) value = 0xFFFF; fallthrough; default: inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); break; } } dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* * These functions manipulate ufs buffers */ #define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size) extern struct ufs_buffer_head * _ubh_bread_(struct ufs_sb_private_info *, struct super_block *, u64 , u64); extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, struct super_block *, u64, u64); extern void ubh_brelse (struct ufs_buffer_head *); extern void ubh_brelse_uspi (struct ufs_sb_private_info *); extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); extern void ubh_sync_block(struct ufs_buffer_head *); extern void ubh_bforget (struct ufs_buffer_head *); extern int ubh_buffer_dirty (struct ufs_buffer_head *); /* This functions works with cache pages*/ struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); static inline void ufs_put_locked_folio(struct folio *folio) { folio_unlock(folio); folio_put(folio); } /* * macros and inline function to get important structures from ufs_sb_private_info */ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, unsigned int offset) { unsigned int index; index = offset >> uspi->s_fshift; offset &= ~uspi->s_fmask; return uspi->s_ubh.bh[index]->b_data + offset; } #define ubh_get_usb_first(uspi) \ ((struct ufs_super_block_first *)get_usb_offset((uspi), 0)) #define ubh_get_usb_second(uspi) \ ((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE)) #define ubh_get_usb_third(uspi) \ ((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE)) #define ubh_get_ucg(ubh) \ ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data)) /* * Extract byte from ufs_buffer_head * Extract the bits for a block from a map inside ufs_buffer_head */ #define ubh_get_addr8(ubh,begin) \ ((u8*)(ubh)->bh[(begin) >> uspi->s_fshift]->b_data + \ ((begin) & ~uspi->s_fmask)) #define ubh_get_addr16(ubh,begin) \ (((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \ ((begin) & ((uspi->fsize>>1) 
- 1))) #define ubh_get_addr32(ubh,begin) \ (((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \ ((begin) & ((uspi->s_fsize>>2) - 1))) #define ubh_get_addr64(ubh,begin) \ (((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \ ((begin) & ((uspi->s_fsize>>3) - 1))) #define ubh_get_addr ubh_get_addr8 static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, struct ufs_buffer_head *ubh, u64 blk) { if (uspi->fs_magic == UFS2_MAGIC) return ubh_get_addr64(ubh, blk); else return ubh_get_addr32(ubh, blk); } #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) static inline u64 ufs_freefrags(struct ufs_sb_private_info *uspi) { return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + uspi->cs_total.cs_nffree; } /* * Macros to access cylinder group array structures */ #define ubh_cg_blktot(ucpi,cylno) \ (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2)))) #define ubh_cg_blks(ucpi,cylno,rpos) \ (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \ (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) /* * Bitmap operations * These functions work like classical bitmap operations. * The difference is that we don't have the whole bitmap * in one contiguous chunk of memory, but in several buffers. * The parameters of each function are super_block, ufs_buffer_head and * position of the beginning of the bitmap. */ #define ubh_setbit(ubh,begin,bit) \ (*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) |= (1 << ((bit) & 7))) #define ubh_clrbit(ubh,begin,bit) \ (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) &= ~(1 << ((bit) & 7))) #define ubh_isset(ubh,begin,bit) \ (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) & (1 << ((bit) & 7))) #define ubh_isclr(ubh,begin,bit) (!ubh_isset(ubh,begin,bit)) #define ubh_find_first_zero_bit(ubh,begin,size) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,0) #define ubh_find_next_zero_bit(ubh,begin,size,offset) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,offset) static inline unsigned _ubh_find_next_zero_bit_( struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned size, unsigned offset) { unsigned base, count, pos; size -= offset; begin <<= 3; offset += begin; base = offset >> uspi->s_bpfshift; offset &= uspi->s_bpfmask; for (;;) { count = min_t(unsigned int, size + offset, uspi->s_bpf); size -= count - offset; pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset); if (pos < count || !size) break; base++; offset = 0; } return (base << uspi->s_bpfshift) + pos - begin; } static inline unsigned find_last_zero_bit (unsigned char * bitmap, unsigned size, unsigned offset) { unsigned bit, i; unsigned char * mapp; unsigned char map; mapp = bitmap + (size >> 3); map = *mapp--; bit = 1 << (size & 7); for (i = size; i > offset; i--) { if ((map & bit) == 0) break; if ((i & 7) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1 << 7; } } return i; } #define ubh_find_last_zero_bit(ubh,begin,size,offset) _ubh_find_last_zero_bit_(uspi,ubh,begin,size,offset) static inline unsigned _ubh_find_last_zero_bit_( struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned start, unsigned end) { unsigned base, count, pos, size; size = start - end; begin <<= 3; start += begin; base = start >> uspi->s_bpfshift; start &= uspi->s_bpfmask; for (;;) { count = min_t(unsigned int, size + (uspi->s_bpf - start), uspi->s_bpf) - (uspi->s_bpf - start); size -= count; pos = find_last_zero_bit (ubh->bh[base]->b_data, 
start, start - count); if (pos > start - count || !size) break; base--; start = uspi->s_bpf; } return (base << uspi->s_bpfshift) + pos - begin; } static inline int ubh_isblockset(struct ufs_sb_private_info *uspi, struct ufs_cg_private_info *ucpi, unsigned int frag) { struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); u8 mask; switch (uspi->s_fpb) { case 8: return *p == 0xff; case 4: mask = 0x0f << (frag & 4); return (*p & mask) == mask; case 2: mask = 0x03 << (frag & 6); return (*p & mask) == mask; case 1: mask = 0x01 << (frag & 7); return (*p & mask) == mask; } return 0; } static inline void ubh_clrblock(struct ufs_sb_private_info *uspi, struct ufs_cg_private_info *ucpi, unsigned int frag) { struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); switch (uspi->s_fpb) { case 8: *p = 0x00; return; case 4: *p &= ~(0x0f << (frag & 4)); return; case 2: *p &= ~(0x03 << (frag & 6)); return; case 1: *p &= ~(0x01 << (frag & 7)); return; } } static inline void ubh_setblock(struct ufs_sb_private_info * uspi, struct ufs_cg_private_info *ucpi, unsigned int frag) { struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); switch (uspi->s_fpb) { case 8: *p = 0xff; return; case 4: *p |= 0x0f << (frag & 4); return; case 2: *p |= 0x03 << (frag & 6); return; case 1: *p |= 0x01 << (frag & 7); return; } } static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, __fs32 * fraglist, int cnt) { struct ufs_sb_private_info * uspi; unsigned fragsize, pos; uspi = UFS_SB(sb)->s_uspi; fragsize = 0; for (pos = 0; pos < uspi->s_fpb; pos++) { if (blockmap & (1 << pos)) { fragsize++; } else if (fragsize > 0) { fs32_add(sb, &fraglist[fragsize], cnt); fragsize = 0; } } if (fragsize > 0 && fragsize < uspi->s_fpb) fs32_add(sb, &fraglist[fragsize], cnt); } static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi, struct ufs_inode_info *ufsi, unsigned blk) { BUG_ON(blk > UFS_TIND_BLOCK); return uspi->fs_magic == UFS2_MAGIC ? (void *)&ufsi->i_u1.u2_i_data[blk] : (void *)&ufsi->i_u1.i_data[blk]; } static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p) { return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ? fs64_to_cpu(sb, *(__fs64 *)p) : fs32_to_cpu(sb, *(__fs32 *)p); } static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val) { if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC) *(__fs64 *)p = cpu_to_fs64(sb, val); else *(__fs32 *)p = cpu_to_fs32(sb, val); } static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi, void *p) { if (uspi->fs_magic == UFS2_MAGIC) *(__fs64 *)p = 0; else *(__fs32 *)p = 0; } static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, void *p) { if (uspi->fs_magic == UFS2_MAGIC) return *(__fs64 *)p == 0; else return *(__fs32 *)p == 0; } static inline __fs32 ufs_get_seconds(struct super_block *sbp) { time64_t now = ktime_get_real_seconds(); /* Signed 32-bit interpretation wraps around in 2038, which * happens in ufs1 inode stamps but not ufs2 using 64-bits * stamps. For superblock and blockgroup, let's assume * unsigned 32-bit stamps, which are good until y2106. * Wrap around rather than clamp here to make the dirty * file system detection work in the superblock stamp. */ return cpu_to_fs32(sbp, lower_32_bits(now)); }
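ufs_fragacct() above maintains a per-size histogram of free-fragment runs: it scans a block's fragment bitmap, and every maximal run of set bits shorter than a full block (uspi->s_fpb fragments) adds cnt to fraglist[run length], while a run covering the whole block is accounted as a free block instead. A small stand-alone sketch of the same accounting follows, using ordinary ints in place of the on-disk __fs32 counters; demo_fragacct and its parameters are invented for the illustration.

/* Illustrative only: redoes the run-length accounting of ufs_fragacct() on a
 * plain int array. fpb plays the role of uspi->s_fpb; a set bit marks a free
 * fragment, as in the cylinder-group free map.
 */
#include <stdio.h>

static void demo_fragacct(unsigned int blockmap, unsigned int fpb,
			  int *fraglist, int cnt)
{
	unsigned int fragsize = 0, pos;

	for (pos = 0; pos < fpb; pos++) {
		if (blockmap & (1u << pos)) {
			fragsize++;
		} else if (fragsize > 0) {
			fraglist[fragsize] += cnt; /* close a run of free frags */
			fragsize = 0;
		}
	}
	/* A run spanning the whole block is a free block, not a partial run,
	 * so it is deliberately left out of the fragment histogram.
	 */
	if (fragsize > 0 && fragsize < fpb)
		fraglist[fragsize] += cnt;
}

int main(void)
{
	int fraglist[9] = { 0 };
	unsigned int i;

	/* 0x67 = 01100111b: free runs of length 3 (bits 0-2) and 2 (bits 5-6). */
	demo_fragacct(0x67, 8, fraglist, 1);

	for (i = 1; i < 8; i++)
		if (fraglist[i])
			printf("runs of %u free fragment(s): %d\n", i, fraglist[i]);
	return 0;
}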
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/tty_port.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.)
*/ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) 
_L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; struct tty_operations; /** * struct tty_struct - state associated with a tty while open * * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) * @driver: &struct tty_driver operating this tty * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) * @index: index of this tty (e.g. to construct @name like tty12) * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty * @ldisc: the current line discipline for this tty (n_tty by default) * @atomic_write_lock: protects against concurrent writers, i.e. locks * @write_cnt, @write_buf and similar * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), * protecting several operations on this tty * @throttle_mutex: protects against concurrent tty_throttle_safe() and * tty_unthrottle_safe() (but not tty_unthrottle()) * @termios_rwsem: protects @termios and @termios_locked * @winsize_mutex: protects @winsize * @termios: termios for the current tty, copied from/to @driver.termios * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS * ioctls) * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... * @count: count of open processes, reaching zero cancels all the work for * this tty and drops a @kref too (but does not free this tty) * @winsize: size of the terminal "window" (cf. @winsize_mutex) * @flow: flow settings grouped together * @flow.lock: lock for @flow members * @flow.stopped: tty stopped/started by stop_tty()/start_tty() * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has * precedence over @flow.stopped) * @ctrl: control settings grouped together * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS * handling * @receive_room: bytes permitted to feed to @ldisc without any being lost * @flow_change: controls behavior of throttling, see tty_throttle_safe() and * tty_unthrottle_safe() * @link: link to another pty (master -> slave and vice versa) * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() * @write_wait: concurrent writers are waiting in this queue until they are * allowed to write * @read_wait: readers wait for data in this queue * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while * freeing the tty, (re)used to release_one_tty() * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) * @driver_data: pointer to @driver's private data (e.g. 
&struct uart_state) * @files_lock: protects @tty_files list * @tty_files: list of (re)openers of this tty (i.e. linked &struct * tty_file_private) * @closing: when set during close, n_tty processes only START & STOP chars * @write_buf: temporary buffer used during tty_write() to copy user data to * @write_cnt: count of bytes written in tty_write() to @write_buf * @SAK_work: if the tty has a pending do_SAK, it is queued here * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent * storage for tty devices is referenced here as @port and is documented in * &struct tty_port. */ struct tty_struct { struct kref kref; int index; struct device *dev; struct tty_driver *driver; struct tty_port *port; const struct tty_operations *ops; struct tty_ldisc *ldisc; struct ld_semaphore ldisc_sem; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; unsigned int receive_room; struct winsize winsize; struct { spinlock_t lock; bool stopped; bool tco_stopped; } flow; struct { struct pid *pgrp; struct pid *session; spinlock_t lock; unsigned char pktstatus; bool packet; } ctrl; bool hw_stopped; bool closing; int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; int write_cnt; u8 *write_buf; struct list_head tty_files; #define N_TTY_BUF_SIZE 4096 struct work_struct SAK_work; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /** * DOC: TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * * TTY_THROTTLED * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * * TTY_IO_ERROR * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * * TTY_OTHER_CLOSED * Device is a pty and the other side has closed. * * TTY_EXCLUSIVE * Exclusive open mode (a single opener). * * TTY_DO_WRITE_WAKEUP * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * * TTY_LDISC_OPEN * Indicates that a line discipline is open. For debugging purposes only. * * TTY_PTY_LOCK * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * * TTY_NO_WRITE_SPLIT * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * * TTY_HUPPED * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * * TTY_HUPPING * The TTY is in the process of hanging up to abort potential readers. * * TTY_LDISC_CHANGING * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. 
* * TTY_LDISC_HALTED * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ #define TTY_THROTTLED 0 #define TTY_IO_ERROR 1 #define TTY_OTHER_CLOSED 2 #define TTY_EXCLUSIVE 3 #define TTY_DO_WRITE_WAKEUP 5 #define TTY_LDISC_OPEN 11 #define TTY_PTY_LOCK 16 #define TTY_NO_WRITE_SPLIT 17 #define TTY_HUPPED 18 #define TTY_HUPPING 19 #define TTY_LDISC_CHANGING 20 #define TTY_LDISC_HALTED 22 static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); struct pid *tty_get_pgrp(struct tty_struct *tty); void tty_vhangup_self(void); void disassociate_ctty(int priv); dev_t tty_devnum(struct tty_struct *tty); void proc_clear_tty(struct task_struct *p); struct tty_struct *get_current_tty(void); /* tty_io.c */ int __init tty_init(void); const char *tty_name(const struct tty_struct *tty); struct tty_struct *tty_kopen_exclusive(dev_t device); struct tty_struct *tty_kopen_shared(dev_t device); void tty_kclose(struct tty_struct *tty); int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen_exclusive(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; int vcs_init(void); extern const struct class tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Returns: a new reference to a tty object * * Locking: The caller must hold sufficient locks/counts to ensure that their * existing reference cannot go away. 
*/ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } const char *tty_driver_name(const struct tty_struct *tty); void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, u8 ch); int tty_put_char(struct tty_struct *tty, u8 c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); void tty_unthrottle(struct tty_struct *tty); bool tty_throttle_safe(struct tty_struct *tty); bool tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int tty_get_tiocm(struct tty_struct *tty); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns: the baud rate as an integer for this terminal * * Locking: The termios lock must be held by the caller. */ static inline speed_t tty_get_baud_rate(const struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); void tty_init_termios(struct tty_struct *tty); void tty_save_termios(struct tty_struct *tty); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT void tty_audit_exit(void); void tty_audit_fork(struct signal_struct *sig); int tty_audit_push(void); #else static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* vt.c */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ void tty_lock(struct tty_struct *tty); int tty_lock_interruptible(struct tty_struct *tty); void tty_unlock(struct 
tty_struct *tty); void tty_lock_slave(struct tty_struct *tty); void tty_unlock_slave(struct tty_struct *tty); void tty_set_lock_subclass(struct tty_struct *tty); #endif
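/*
 * Illustrative sketch (not part of the header above): how a caller might
 * combine the reference helpers and termios accessor macros declared in this
 * file. The function name and the policy it implements are hypothetical; the
 * macros, tty_kref_get()/tty_kref_put() and tty_get_baud_rate() are the
 * interfaces documented above, and the code assumes it is built in a context
 * that includes <linux/tty.h>.
 */
static bool example_tty_wants_sw_flow_control(struct tty_struct *tty)
{
	bool enabled;

	/* Hold a reference so the tty cannot be freed while we inspect it. */
	tty = tty_kref_get(tty);
	if (!tty)
		return false;

	/* c_iflag/c_lflag accessors declared above. */
	enabled = I_IXON(tty) && L_ICANON(tty);

	/* tty_get_baud_rate() expects the termios lock held by the caller. */
	if (enabled && tty_get_baud_rate(tty) == 0)
		enabled = false;

	tty_kref_put(tty);
	return enabled;
}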
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2022 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Fair Capacity and Weighted Fair Queueing handling * RFC 8260 section 3.5 and 3.6 */ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream); static int sctp_sched_wfq_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; if (!weight) return -EINVAL; soute->fc_weight = weight; return 0; } static int sctp_sched_wfq_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; *value = soute->fc_weight; return 0; } static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { return 0; } static int sctp_sched_fc_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { return 0; } static int sctp_sched_fc_init(struct sctp_stream *stream) { INIT_LIST_HEAD(&stream->fc_list); return 0; } static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&soute->fc_list); soute->fc_length = 0; soute->fc_weight = 1; return 0; } static void sctp_sched_fc_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fc_sched(struct sctp_stream *stream, struct sctp_stream_out_ext *soute) { struct sctp_stream_out_ext *pos; if (!list_empty(&soute->fc_list)) return; list_for_each_entry(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_add_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { struct sctp_stream *stream; struct sctp_chunk *ch; __u16 sid; ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_fc_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute; struct sctp_chunk *ch; /* Bail out quickly if queue is empty */ if (list_empty(&q->out_chunk_list)) return NULL; /* Find which chunk is
next */ if (stream->out_curr) soute = stream->out_curr->ext; else soute = list_entry(stream->fc_list.next, struct sctp_stream_out_ext, fc_list); ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); sctp_sched_dequeue_common(q, ch); return ch; } static void sctp_sched_fc_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute, *pos; __u16 sid, i; sid = sctp_chunk_stream_no(ch); soute = SCTP_SO(stream, sid)->ext; /* reduce all fc_lengths by U32_MAX / 4 if the current fc_length overflows. */ if (soute->fc_length > U32_MAX - ch->skb->len) { for (i = 0; i < stream->outcnt; i++) { pos = SCTP_SO(stream, i)->ext; if (!pos) continue; if (pos->fc_length <= (U32_MAX >> 2)) { pos->fc_length = 0; continue; } pos->fc_length -= (U32_MAX >> 2); } } soute->fc_length += ch->skb->len; if (list_empty(&soute->outq)) { list_del_init(&soute->fc_list); return; } pos = soute; list_for_each_entry_continue(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_move_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_sched_all(struct sctp_stream *stream) { struct sctp_association *asoc; struct sctp_chunk *ch; asoc = container_of(stream, struct sctp_association, stream); list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (SCTP_SO(stream, sid)->ext) sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } } static void sctp_sched_fc_unsched_all(struct sctp_stream *stream) { struct sctp_stream_out_ext *soute, *tmp; list_for_each_entry_safe(soute, tmp, &stream->fc_list, fc_list) list_del_init(&soute->fc_list); } static struct sctp_sched_ops sctp_sched_fc = { .set = sctp_sched_fc_set, .get = sctp_sched_fc_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_fc_init(void) { sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc); } static struct sctp_sched_ops sctp_sched_wfq = { .set = sctp_sched_wfq_set, .get = sctp_sched_wfq_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_wfq_init(void) { sctp_sched_ops_register(SCTP_SS_WFQ, &sctp_sched_wfq); }
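/*
 * Illustrative sketch only: the ordering rule that sctp_sched_fc_sched() and
 * sctp_sched_fc_dequeue_done() above apply when keeping fc_list sorted. A
 * stream should run before another when its virtual time fc_length/fc_weight
 * is smaller; the code avoids the division (and 32-bit overflow) by
 * cross-multiplying in 64 bits. The helper name and sample values are made
 * up for the example.
 */
static bool fc_runs_before(__u32 len_a, __u16 weight_a,
			   __u32 len_b, __u16 weight_b)
{
	/* len_a / weight_a < len_b / weight_b, evaluated without dividing */
	return (__u64)len_a * weight_b < (__u64)len_b * weight_a;
}

/*
 * Example: stream A has sent 3000 bytes with weight 1, stream B has sent
 * 5000 bytes with weight 2. B's virtual time (2500) is lower, so under WFQ
 * fc_runs_before(5000, 2, 3000, 1) is true and B is serviced first; with
 * equal weights (plain fair capacity) A would be picked instead.
 */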
// SPDX-License-Identifier: GPL-2.0 /* * Provide a default dump_stack() function for architectures * which don't implement their own. */ #include <linux/kernel.h> #include <linux/buildid.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/smp.h> #include <linux/atomic.h> #include <linux/kexec.h> #include <linux/utsname.h> #include <linux/stop_machine.h> static char dump_stack_arch_desc_str[128]; /** * dump_stack_set_arch_desc - set arch-specific str to show with task dumps * @fmt: printf-style format string * @...: arguments for the format string * * The configured string will be printed right after utsname during task * dumps. Usually used to add arch-specific system identifiers. If an * arch wants to make use of such an ID string, it should initialize this * as soon as possible during boot. */ void __init dump_stack_set_arch_desc(const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), fmt, args); va_end(args); } #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) #define BUILD_ID_FMT " %20phN" #define BUILD_ID_VAL vmlinux_build_id #else #define BUILD_ID_FMT "%s" #define BUILD_ID_VAL "" #endif /** * dump_stack_print_info - print generic debug info for dump_stack() * @log_lvl: log level * * Arch-specific dump_stack() implementations can use this function to * print out the same debug information as the generic dump_stack(). */ void dump_stack_print_info(const char *log_lvl) { printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n", log_lvl, raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), current->pid, current->comm, kexec_crash_loaded() ? "Kdump: loaded " : "", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version, BUILD_ID_VAL); if (get_taint()) printk("%s%s\n", log_lvl, print_tainted_verbose()); if (dump_stack_arch_desc_str[0] != '\0') printk("%sHardware name: %s\n", log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); print_scx_info(log_lvl, current); } /** * show_regs_print_info - print generic debug info for show_regs() * @log_lvl: log level * * show_regs() implementations can use this function to print out generic * debug information. */ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); } static void __dump_stack(const char *log_lvl) { dump_stack_print_info(log_lvl); show_stack(NULL, NULL, log_lvl); } /** * dump_stack_lvl - dump the current task information and its stack trace * @log_lvl: log level * * Architectures can override this implementation by implementing their own. */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { bool in_panic = this_cpu_in_panic(); unsigned long flags; /* * Permit this cpu to perform nested stack dumps while serialising * against other CPUs, unless this CPU is in panic. * * When in panic, non-panic CPUs are not permitted to store new * printk messages so there is no need to synchronize the output.
* This avoids potential deadlock in panic() if another CPU is * holding and unable to release the printk_cpu_sync. */ if (!in_panic) printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); if (!in_panic) printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); asmlinkage __visible void dump_stack(void) { dump_stack_lvl(KERN_DEFAULT); } EXPORT_SYMBOL(dump_stack);
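/*
 * Illustrative sketch only (not part of this file): typical users of the
 * interfaces above. An architecture records a board identifier early in boot
 * via dump_stack_set_arch_desc(); any code that hits an unexpected state can
 * emit the generic header plus a backtrace with dump_stack(), or at an
 * explicit log level with dump_stack_lvl(). The function name, message and
 * "board_rev" value are hypothetical.
 */
static int example_check_state(int state)
{
	if (state < 0) {
		pr_warn("example: unexpected state %d\n", state);
		dump_stack_lvl(KERN_WARNING);	/* header + backtrace at warning level */
		return -EINVAL;
	}
	return 0;
}

/* From an arch's setup code, hypothetically: */
/* dump_stack_set_arch_desc("ACME Board rev %u", board_rev); */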
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "rsrc.h" #include "openclose.h" struct io_open { struct file *file; int dfd; u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; }; struct io_close { struct file *file; int fd; u32 file_slot; }; struct io_fixed_install { struct file *file; unsigned int o_flags; }; static bool io_openat_force_async(struct io_open *open) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, * it'll always return -EAGAIN. Note that we test for __O_TMPFILE because * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force * async for.
*/ return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE); } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); const char __user *fname; int ret; if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; /* open.how should be already initialised */ if (!(open->how.flags & O_PATH) && force_o_largefile()) open->how.flags |= O_LARGEFILE; open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); open->filename = getname(fname); if (IS_ERR(open->filename)) { ret = PTR_ERR(open->filename); open->filename = NULL; return ret; } open->file_slot = READ_ONCE(sqe->file_index); if (open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; if (io_openat_force_async(open)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); u64 mode = READ_ONCE(sqe->len); u64 flags = READ_ONCE(sqe->open_flags); open->how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct open_how __user *how; size_t len; int ret; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); if (ret) return ret; return __io_openat_prep(req, sqe); } int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct open_flags op; struct file *file; bool resolve_nonblock, nonblock_set; bool fixed = !!open->file_slot; int ret; ret = build_open_flags(&open->how, &op); if (ret) goto err; nonblock_set = op.open_flag & O_NONBLOCK; resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { WARN_ON_ONCE(io_openat_force_async(open)); op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } if (!fixed) { ret = __get_unused_fd_flags(open->how.flags, open->nofile); if (ret < 0) goto err; } file = do_filp_open(open->dfd, open->filename, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. 
*/ if (!fixed) put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ if (ret == -EAGAIN && (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) return -EAGAIN; goto err; } if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; if (!fixed) fd_install(ret, file); else ret = io_fixed_fd_install(req, issue_flags, file, open->file_slot); err: putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) { return io_openat2(req, issue_flags); } void io_open_cleanup(struct io_kiocb *req) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); if (open->filename) putname(open->filename); } int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset) { int ret; io_ring_submit_lock(ctx, issue_flags); ret = io_fixed_fd_remove(ctx, offset); io_ring_submit_unlock(ctx, issue_flags); return ret; } static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { struct io_close *close = io_kiocb_to_cmd(req, struct io_close); return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1); } int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_close *close = io_kiocb_to_cmd(req, struct io_close); if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; close->fd = READ_ONCE(sqe->fd); close->file_slot = READ_ONCE(sqe->file_index); if (close->file_slot && close->fd) return -EINVAL; return 0; } int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; struct io_close *close = io_kiocb_to_cmd(req, struct io_close); struct file *file; int ret = -EBADF; if (close->file_slot) { ret = io_close_fixed(req, issue_flags); goto err; } spin_lock(&files->file_lock); file = files_lookup_fd_locked(files, close->fd); if (!file || io_is_uring_fops(file)) { spin_unlock(&files->file_lock); goto err; } /* if the file has a flush method, be safe and punt to async */ if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { spin_unlock(&files->file_lock); return -EAGAIN; } file = file_close_fd_locked(files, close->fd); spin_unlock(&files->file_lock); if (!file) goto err; /* No ->flush() or already async, safely close from here */ ret = filp_close(file, current->files); err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fixed_install *ifi; unsigned int flags; if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || sqe->splice_fd_in || sqe->addr3) return -EINVAL; /* must be a fixed file */ if (!(req->flags & REQ_F_FIXED_FILE)) return -EBADF; flags = READ_ONCE(sqe->install_fd_flags); if (flags & ~IORING_FIXED_FD_NO_CLOEXEC) return -EINVAL; /* ensure the task's creds are used when installing/receiving fds */ if (req->flags & REQ_F_CREDS) return -EPERM; /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */ ifi = io_kiocb_to_cmd(req, struct io_fixed_install); ifi->o_flags = O_CLOEXEC; if (flags & IORING_FIXED_FD_NO_CLOEXEC) ifi->o_flags = 0; return 0; } int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_fixed_install *ifi; int ret; ifi = io_kiocb_to_cmd(req, struct io_fixed_install); ret = 
receive_fd(req->file, NULL, ifi->o_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
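/*
 * Illustrative userspace sketch (not part of this file), assuming liburing is
 * available: it exercises the fields io_openat2_prep() decodes above, i.e.
 * the pathname in sqe->addr, a struct open_how at sqe->addr2 with its size in
 * sqe->len, and (optionally) a fixed-file slot in sqe->file_index. The
 * RESOLVE_CACHED flag ties into the -EAGAIN handling above: when the
 * application sets it, io_openat2() reports -EAGAIN in the CQE instead of
 * retrying the open from its async path. Error handling is abbreviated.
 */
#include <liburing.h>
#include <linux/openat2.h>
#include <fcntl.h>
#include <errno.h>

static int example_openat2(struct io_uring *ring, const char *path)
{
	struct open_how how = {
		.flags = O_RDONLY | O_CLOEXEC,
		.resolve = RESOLVE_CACHED,	/* cached lookup only; -EAGAIN otherwise */
	};
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret, fd;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_openat2(sqe, AT_FDCWD, path, &how);
	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	fd = cqe->res;		/* opened descriptor, or negative errno */
	io_uring_cqe_seen(ring, cqe);
	return fd;
}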
// SPDX-License-Identifier: GPL-2.0-only /* * dma-fence-util: misc functions for dma_fence objects * * Copyright (C) 2022 Advanced Micro Devices, Inc. * Authors: * Christian König <christian.koenig@amd.com> */ #include <linux/dma-fence.h> #include <linux/dma-fence-array.h> #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> #include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * __dma_fence_unwrap_array(struct dma_fence_unwrap *cursor) { cursor->array = dma_fence_chain_contained(cursor->chain); cursor->index = 0; return dma_fence_array_first(cursor->array); } /** * dma_fence_unwrap_first - return the first fence from fence containers * @head: the entrypoint into the containers * @cursor: current position inside the containers * * Unwraps potential dma_fence_chain/dma_fence_array containers and returns the * first fence. */ struct dma_fence *dma_fence_unwrap_first(struct dma_fence *head, struct dma_fence_unwrap *cursor) { cursor->chain = dma_fence_get(head); return __dma_fence_unwrap_array(cursor); } EXPORT_SYMBOL_GPL(dma_fence_unwrap_first); /** * dma_fence_unwrap_next - return the next fence from the fence containers * @cursor: current position inside the containers * * Continue unwrapping the dma_fence_chain/dma_fence_array containers and return * the next fence from them. */ struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor) { struct dma_fence *tmp; ++cursor->index; tmp = dma_fence_array_next(cursor->array, cursor->index); if (tmp) return tmp; cursor->chain = dma_fence_chain_walk(cursor->chain); return __dma_fence_unwrap_array(cursor); } EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); static int fence_cmp(const void *_a, const void *_b) { struct dma_fence *a = *(struct dma_fence **)_a; struct dma_fence *b = *(struct dma_fence **)_b; if (a->context < b->context) return -1; else if (a->context > b->context) return 1; if (dma_fence_is_later(b, a)) return 1; else if (dma_fence_is_later(a, b)) return -1; return 0; } /* Implementation for the dma_fence_unwrap_merge() macro, don't use directly */ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, struct dma_fence_unwrap *iter) { struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; int i, j, count; count = 0; timestamp = ns_to_ktime(0); for (i = 0; i < num_fences; ++i) { dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { if (!dma_fence_is_signaled(tmp)) { ++count; } else { ktime_t t = dma_fence_timestamp(tmp); if (ktime_after(t, timestamp)) timestamp = t; } } } /* * If we couldn't find a pending fence just return a private signaled * fence with the timestamp of the last signaled one.
*/ if (count == 0) return dma_fence_allocate_private_stub(timestamp); array = kmalloc_array(count, sizeof(*array), GFP_KERNEL); if (!array) return NULL; count = 0; for (i = 0; i < num_fences; ++i) { dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { if (!dma_fence_is_signaled(tmp)) { array[count++] = dma_fence_get(tmp); } else { ktime_t t = dma_fence_timestamp(tmp); if (ktime_after(t, timestamp)) timestamp = t; } } } if (count == 0 || count == 1) goto return_fastpath; sort(array, count, sizeof(*array), fence_cmp, NULL); /* * Only keep the most recent fence for each context. */ j = 0; for (i = 1; i < count; i++) { if (array[i]->context == array[j]->context) dma_fence_put(array[i]); else array[++j] = array[i]; } count = ++j; if (count > 1) { result = dma_fence_array_create(count, array, dma_fence_context_alloc(1), 1, false); if (!result) { for (i = 0; i < count; i++) dma_fence_put(array[i]); tmp = NULL; goto return_tmp; } return &result->base; } return_fastpath: if (count == 0) tmp = dma_fence_allocate_private_stub(timestamp); else tmp = array[0]; return_tmp: kfree(array); return tmp; } EXPORT_SYMBOL_GPL(__dma_fence_unwrap_merge);
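/*
 * Illustrative sketch only: how a driver might consume the cursor API above.
 * dma_fence_unwrap_for_each() (from <linux/dma-fence-unwrap.h>) expands to
 * the dma_fence_unwrap_first()/dma_fence_unwrap_next() pair exported here and
 * walks chains and arrays transparently; a plain fence yields exactly once.
 * The helper name is hypothetical.
 */
static unsigned int example_count_pending(struct dma_fence *composite)
{
	struct dma_fence_unwrap iter;
	struct dma_fence *f;
	unsigned int pending = 0;

	dma_fence_unwrap_for_each(f, &iter, composite) {
		if (!dma_fence_is_signaled(f))
			pending++;
	}
	return pending;
}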
/* * Copyright (c) 2006
Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/completion.h> #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/random.h> #include <rdma/ib_cache.h> #include "sa.h" static int mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { .name = "ib_multicast", .add = mcast_add_one, .remove = mcast_remove_one }; static struct ib_sa_client sa_client; static struct workqueue_struct *mcast_wq; static union ib_gid mgid0; struct mcast_device; struct mcast_port { struct mcast_device *dev; spinlock_t lock; struct rb_root table; refcount_t refcount; struct completion comp; u32 port_num; }; struct mcast_device { struct ib_device *device; struct ib_event_handler event_handler; int start_port; int end_port; struct mcast_port port[]; }; enum mcast_state { MCAST_JOINING, MCAST_MEMBER, MCAST_ERROR, }; enum mcast_group_state { MCAST_IDLE, MCAST_BUSY, MCAST_GROUP_ERROR, MCAST_PKEY_EVENT }; enum { MCAST_INVALID_PKEY_INDEX = 0xFFFF }; struct mcast_member; struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; struct mcast_port *port; spinlock_t lock; struct work_struct work; struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; u16 pkey_index; u8 leave_state; int retries; }; struct mcast_member { struct ib_sa_multicast multicast; struct ib_sa_client *client; struct mcast_group *group; struct list_head list; enum mcast_state state; refcount_t refcount; struct completion comp; }; static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static struct mcast_group *mcast_find(struct mcast_port *port, union ib_gid *mgid) { struct rb_node *node = port->table.rb_node; struct mcast_group *group; int ret; while (node) { group = rb_entry(node, struct mcast_group, node); ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); if (!ret) return group; if (ret 
< 0) node = node->rb_left; else node = node->rb_right; } return NULL; } static struct mcast_group *mcast_insert(struct mcast_port *port, struct mcast_group *group, int allow_duplicates) { struct rb_node **link = &port->table.rb_node; struct rb_node *parent = NULL; struct mcast_group *cur_group; int ret; while (*link) { parent = *link; cur_group = rb_entry(parent, struct mcast_group, node); ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, sizeof group->rec.mgid); if (ret < 0) link = &(*link)->rb_left; else if (ret > 0) link = &(*link)->rb_right; else if (allow_duplicates) link = &(*link)->rb_left; else return cur_group; } rb_link_node(&group->node, parent, link); rb_insert_color(&group->node, &port->table); return NULL; } static void deref_port(struct mcast_port *port) { if (refcount_dec_and_test(&port->refcount)) complete(&port->comp); } static void release_group(struct mcast_group *group) { struct mcast_port *port = group->port; unsigned long flags; spin_lock_irqsave(&port->lock, flags); if (atomic_dec_and_test(&group->refcount)) { rb_erase(&group->node, &port->table); spin_unlock_irqrestore(&port->lock, flags); kfree(group); deref_port(port); } else spin_unlock_irqrestore(&port->lock, flags); } static void deref_member(struct mcast_member *member) { if (refcount_dec_and_test(&member->refcount)) complete(&member->comp); } static void queue_join(struct mcast_member *member) { struct mcast_group *group = member->group; unsigned long flags; spin_lock_irqsave(&group->lock, flags); list_add_tail(&member->list, &group->pending_list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } spin_unlock_irqrestore(&group->lock, flags); } /* * A multicast group has four types of members: full member, non member, * sendonly non member and sendonly full member. * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members that belong to * the specified join states. */ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } /* * If a multicast group has zero members left for a particular join state, but * the group is still a member with the SA, we need to leave that join state. * Determine which join states we still belong to, but that do not have any * active members.
*/ static u8 get_leave_state(struct mcast_group *group) { u8 leave_state = 0; int i; for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); return leave_state & group->rec.join_state; } static int check_selector(ib_sa_comp_mask comp_mask, ib_sa_comp_mask selector_mask, ib_sa_comp_mask value_mask, u8 selector, u8 src_value, u8 dst_value) { int err; if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) return 0; switch (selector) { case IB_SA_GT: err = (src_value <= dst_value); break; case IB_SA_LT: err = (src_value >= dst_value); break; case IB_SA_EQ: err = (src_value != dst_value); break; default: err = 0; break; } return err; } static int cmp_rec(struct ib_sa_mcmember_rec *src, struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) { /* MGID must already match */ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, src->rate, dst->rate)) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, dst->packet_life_time_selector, src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && src->flow_label != dst->flow_label) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && src->hop_limit != dst->hop_limit) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) return -EINVAL; /* join_state checked separately, proxy_join ignored */ return 0; } static int send_join(struct mcast_group *group, struct mcast_member *member) { struct mcast_port *port = group->port; int ret; group->last_join = member; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_MGMT_METHOD_SET, &member->multicast.rec, member->multicast.comp_mask, 3000, GFP_KERNEL, join_handler, group, &group->query); return (ret > 0) ? 0 : ret; } static int send_leave(struct mcast_group *group, u8 leave_state) { struct mcast_port *port = group->port; struct ib_sa_mcmember_rec rec; int ret; rec = group->rec; rec.join_state = leave_state; group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE, 3000, GFP_KERNEL, leave_handler, group, &group->query); return (ret > 0) ? 
0 : ret; } static void join_group(struct mcast_group *group, struct mcast_member *member, u8 join_state) { member->state = MCAST_MEMBER; adjust_membership(group, join_state, 1); group->rec.join_state |= join_state; member->multicast.rec = group->rec; member->multicast.rec.join_state = join_state; list_move(&member->list, &group->active_list); } static int fail_join(struct mcast_group *group, struct mcast_member *member, int status) { spin_lock_irq(&group->lock); list_del_init(&member->list); spin_unlock_irq(&group->lock); return member->multicast.callback(status, &member->multicast); } static void process_group_error(struct mcast_group *group) { struct mcast_member *member; int ret = 0; u16 pkey_index; if (group->state == MCAST_PKEY_EVENT) ret = ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(group->rec.pkey), &pkey_index); spin_lock_irq(&group->lock); if (group->state == MCAST_PKEY_EVENT && !ret && group->pkey_index == pkey_index) goto out; while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); refcount_inc(&member->refcount); list_del_init(&member->list); adjust_membership(group, member->multicast.rec.join_state, -1); member->state = MCAST_ERROR; spin_unlock_irq(&group->lock); ret = member->multicast.callback(-ENETRESET, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } group->rec.join_state = 0; out: group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); } static void mcast_work_handler(struct work_struct *work) { struct mcast_group *group; struct mcast_member *member; struct ib_sa_multicast *multicast; int status, ret; u8 join_state; group = container_of(work, typeof(*group), work); retest: spin_lock_irq(&group->lock); while (!list_empty(&group->pending_list) || (group->state != MCAST_BUSY)) { if (group->state != MCAST_BUSY) { spin_unlock_irq(&group->lock); process_group_error(group); goto retest; } member = list_entry(group->pending_list.next, struct mcast_member, list); multicast = &member->multicast; join_state = multicast->rec.join_state; refcount_inc(&member->refcount); if (join_state == (group->rec.join_state & join_state)) { status = cmp_rec(&group->rec, &multicast->rec, multicast->comp_mask); if (!status) join_group(group, member, join_state); else list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = multicast->callback(status, multicast); } else { spin_unlock_irq(&group->lock); status = send_join(group, member); if (!status) { deref_member(member); return; } ret = fail_join(group, member, status); } deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } join_state = get_leave_state(group); if (join_state) { group->rec.join_state &= ~join_state; spin_unlock_irq(&group->lock); if (send_leave(group, join_state)) goto retest; } else { group->state = MCAST_IDLE; spin_unlock_irq(&group->lock); release_group(group); } } /* * Fail a join request if it is still active - at the head of the pending queue. 
*/ static void process_join_error(struct mcast_group *group, int status) { struct mcast_member *member; int ret; spin_lock_irq(&group->lock); member = list_entry(group->pending_list.next, struct mcast_member, list); if (group->last_join == member) { refcount_inc(&member->refcount); list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = member->multicast.callback(status, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); } else spin_unlock_irq(&group->lock); } static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; u16 pkey_index = MCAST_INVALID_PKEY_INDEX; if (status) process_join_error(group, status); else { int mgids_changed, is_mgid0; if (ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index)) pkey_index = MCAST_INVALID_PKEY_INDEX; spin_lock_irq(&group->port->lock); if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, sizeof(group->rec.mgid)); group->rec = *rec; if (mgids_changed) { rb_erase(&group->node, &group->port->table); is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, sizeof(mgid0)); mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } mcast_work_handler(&group->work); } static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; if (status && group->retries > 0 && !send_leave(group, group->leave_state)) group->retries--; else mcast_work_handler(&group->work); } static struct mcast_group *acquire_group(struct mcast_port *port, union ib_gid *mgid, gfp_t gfp_mask) { struct mcast_group *group, *cur_group; unsigned long flags; int is_mgid0; is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); if (!is_mgid0) { spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) goto found; spin_unlock_irqrestore(&port->lock, flags); } group = kzalloc(sizeof *group, gfp_mask); if (!group) return NULL; group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; INIT_LIST_HEAD(&group->pending_list); INIT_LIST_HEAD(&group->active_list); INIT_WORK(&group->work, mcast_work_handler); spin_lock_init(&group->lock); spin_lock_irqsave(&port->lock, flags); cur_group = mcast_insert(port, group, is_mgid0); if (cur_group) { kfree(group); group = cur_group; } else refcount_inc(&port->refcount); found: atomic_inc(&group->refcount); spin_unlock_irqrestore(&port->lock, flags); return group; } /* * We serialize all join requests to a single group to make our lives much * easier. Otherwise, two users could try to join the same group * simultaneously, with different configurations, one could leave while the * join is in progress, etc., which makes locking around error recovery * difficult. 
*/ struct ib_sa_multicast * ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context) { struct mcast_device *dev; struct mcast_member *member; struct ib_sa_multicast *multicast; int ret; dev = ib_get_client_data(device, &mcast_client); if (!dev) return ERR_PTR(-ENODEV); member = kmalloc(sizeof *member, gfp_mask); if (!member) return ERR_PTR(-ENOMEM); ib_sa_client_get(client); member->client = client; member->multicast.rec = *rec; member->multicast.comp_mask = comp_mask; member->multicast.callback = callback; member->multicast.context = context; init_completion(&member->comp); refcount_set(&member->refcount, 1); member->state = MCAST_JOINING; member->group = acquire_group(&dev->port[port_num - dev->start_port], &rec->mgid, gfp_mask); if (!member->group) { ret = -ENOMEM; goto err; } /* * The user will get the multicast structure in their callback. They * could then free the multicast structure before we can return from * this routine. So we save the pointer to return before queuing * any callback. */ multicast = &member->multicast; queue_join(member); return multicast; err: ib_sa_client_put(client); kfree(member); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_sa_join_multicast); void ib_sa_free_multicast(struct ib_sa_multicast *multicast) { struct mcast_member *member; struct mcast_group *group; member = container_of(multicast, struct mcast_member, multicast); group = member->group; spin_lock_irq(&group->lock); if (member->state == MCAST_MEMBER) adjust_membership(group, multicast->rec.join_state, -1); list_del_init(&member->list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); /* Continue to hold reference on group until callback */ queue_work(mcast_wq, &group->work); } else { spin_unlock_irq(&group->lock); release_group(group); } deref_member(member); wait_for_completion(&member->comp); ib_sa_client_put(member->client); kfree(member); } EXPORT_SYMBOL(ib_sa_free_multicast); int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) { struct mcast_device *dev; struct mcast_port *port; struct mcast_group *group; unsigned long flags; int ret = 0; dev = ib_get_client_data(device, &mcast_client); if (!dev) return -ENODEV; port = &dev->port[port_num - dev->start_port]; spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) *rec = group->rec; else ret = -EADDRNOTAVAIL; spin_unlock_irqrestore(&port->lock, flags); return ret; } EXPORT_SYMBOL(ib_sa_get_mcmember_rec); /** * ib_init_ah_from_mcmember - Initialize AH attribute from multicast * member record and gid of the device. * @device: RDMA device * @port_num: Port of the rdma device to consider * @rec: Multicast member record to use * @ndev: Optional netdevice, applicable only for RoCE * @gid_type: GID type to consider * @ah_attr: AH attribute to fillup on successful completion * * ib_init_ah_from_mcmember() initializes AH attribute based on multicast * member record and other device properties. On success the caller is * responsible to call rdma_destroy_ah_attr on the ah_attr. Returns 0 on * success or appropriate error code. 
* */ int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, struct rdma_ah_attr *ah_attr) { const struct ib_gid_attr *sgid_attr; /* GID table is not based on the netdevice for IB link layer, * so ignore ndev during search. */ if (rdma_protocol_ib(device, port_num)) ndev = NULL; else if (!rdma_protocol_roce(device, port_num)) return -EINVAL; sgid_attr = rdma_find_gid_by_port(device, &rec->port_gid, gid_type, port_num, ndev); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); memset(ah_attr, 0, sizeof(*ah_attr)); ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid)); rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_port_num(ah_attr, port_num); rdma_ah_set_static_rate(ah_attr, rec->rate); rdma_move_grh_sgid_attr(ah_attr, &rec->mgid, be32_to_cpu(rec->flow_label), rec->hop_limit, rec->traffic_class, sgid_attr); return 0; } EXPORT_SYMBOL(ib_init_ah_from_mcmember); static void mcast_groups_event(struct mcast_port *port, enum mcast_group_state state) { struct mcast_group *group; struct rb_node *node; unsigned long flags; spin_lock_irqsave(&port->lock, flags); for (node = rb_first(&port->table); node; node = rb_next(node)) { group = rb_entry(node, struct mcast_group, node); spin_lock(&group->lock); if (group->state == MCAST_IDLE) { atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } if (group->state != MCAST_GROUP_ERROR) group->state = state; spin_unlock(&group->lock); } spin_unlock_irqrestore(&port->lock, flags); } static void mcast_event_handler(struct ib_event_handler *handler, struct ib_event *event) { struct mcast_device *dev; int index; dev = container_of(handler, struct mcast_device, event_handler); if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) return; index = event->element.port_num - dev->start_port; switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; case IB_EVENT_PKEY_CHANGE: mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); break; default: break; } } static int mcast_add_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; int i; int count = 0; dev = kmalloc(struct_size(dev, port, device->phys_port_cnt), GFP_KERNEL); if (!dev) return -ENOMEM; dev->start_port = rdma_start_port(device); dev->end_port = rdma_end_port(device); for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (!rdma_cap_ib_mcast(device, dev->start_port + i)) continue; port = &dev->port[i]; port->dev = dev; port->port_num = dev->start_port + i; spin_lock_init(&port->lock); port->table = RB_ROOT; init_completion(&port->comp); refcount_set(&port->refcount, 1); ++count; } if (!count) { kfree(dev); return -EOPNOTSUPP; } dev->device = device; ib_set_client_data(device, &mcast_client, dev); INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); return 0; } static void mcast_remove_one(struct ib_device *device, void *client_data) { struct mcast_device *dev = client_data; struct mcast_port *port; int i; ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (rdma_cap_ib_mcast(device, dev->start_port + i)) { port = &dev->port[i]; deref_port(port); wait_for_completion(&port->comp); } } kfree(dev); } int mcast_init(void) { int ret; mcast_wq = 
alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); if (!mcast_wq) return -ENOMEM; ib_sa_register_client(&sa_client); ret = ib_register_client(&mcast_client); if (ret) goto err; return 0; err: ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); return ret; } void mcast_cleanup(void) { ib_unregister_client(&mcast_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); }
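Editor's note: the join/leave API implemented above is easiest to follow from a caller's point of view. The sketch below is illustrative only and is not part of this file; the demo_* names are invented, the declarations are assumed to come from <rdma/ib_sa.h> and <rdma/ib_verbs.h>, and a real consumer (such as IPoIB) sets many more comp_mask fields. It mainly shows the ordering that the comment inside ib_sa_join_multicast() insists on: the callback may fire before the join call returns, so the returned pointer is the only handle the caller should rely on.

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_sa.h>		/* assumed home of the ib_sa_* declarations */

/* Hypothetical consumer; the demo_* names are not part of the kernel. */
static struct ib_sa_client demo_sa_client;

static int demo_join_done(int status, struct ib_sa_multicast *mc)
{
	/* Invoked from the mcast workqueue once the SA has answered. */
	if (status)
		pr_warn("demo: multicast join failed: %d\n", status);
	/* A non-zero return here would make the core leave the group again. */
	return 0;
}

static struct ib_sa_multicast *demo_join(struct ib_device *device, u32 port_num,
					 struct ib_sa_mcmember_rec *rec)
{
	ib_sa_register_client(&demo_sa_client);
	/*
	 * comp_mask tells the SA which fields of @rec are meaningful; a real
	 * caller normally sets more fields (P_Key, Q_Key, ...) than these two.
	 * The return value is ERR_PTR()-encoded on failure.
	 */
	return ib_sa_join_multicast(&demo_sa_client, device, port_num, rec,
				    IB_SA_MCMEMBER_REC_MGID |
				    IB_SA_MCMEMBER_REC_JOIN_STATE,
				    GFP_KERNEL, demo_join_done, NULL);
}

static void demo_leave(struct ib_sa_multicast *mc)
{
	/* Waits until the member is fully unreferenced before returning. */
	ib_sa_free_multicast(mc);
	ib_sa_unregister_client(&demo_sa_client);
}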
855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <linux/bitmap.h> #include "netlink.h" #include "bitset.h" /* Some bitmaps are internally represented as an array of unsigned long, some * as an array of u32 (some even as single u32 for now). To avoid the need of * wrappers on caller side, we provide two set of functions: those with "32" * suffix in their names expect u32 based bitmaps, those without it expect * unsigned long bitmaps. */ static u32 ethnl_lower_bits(unsigned int n) { return ~(u32)0 >> (32 - n % 32); } static u32 ethnl_upper_bits(unsigned int n) { return ~(u32)0 << (n % 32); } /** * ethnl_bitmap32_clear() - Clear u32 based bitmap * @dst: bitmap to clear * @start: beginning of the interval * @end: end of the interval * @mod: set if bitmap was modified * * Clear @nbits bits of a bitmap with indices @start <= i < @end */ static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, bool *mod) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; unsigned int i; u32 mask; if (end <= start) return; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } return; } if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } start_word++; } for (i = start_word; i < end_word; i++) { if (dst[i]) { dst[i] = 0; *mod = true; } } if (end % 32) { mask = ethnl_lower_bits(end); if (dst[end_word] & mask) { dst[end_word] &= ~mask; *mod = true; } } } /** * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval * @map: bitmap to test * @start: beginning of the interval * @end: end of the interval * * Return: true if there is non-zero bit with index @start <= i < @end, * false if the whole interval is zero */ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, unsigned int end) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; u32 mask; if (end <= start) return true; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); return map[start_word] & mask; } if (map[start_word] & mask) return true; start_word++; } if (!memchr_inv(map + start_word, '\0', (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) return true; return map[end_word] & ethnl_lower_bits(end); } /** * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask * pair * @dst: bitmap to update * @nbits: bit size of the bitmap * @value: values to set * @mask: mask of bits to set * @mod: set to true if bitmap is modified, preserve if not * * Set bits in @dst bitmap which are set in @mask to values from @value, leave * the rest untouched. If destination bitmap was modified, set @mod to true, * leave as it is if not. */ static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, const u32 *value, const u32 *mask, bool *mod) { while (nbits > 0) { u32 real_mask = mask ? 
*mask : ~(u32)0; u32 new_value; if (nbits < 32) real_mask &= ethnl_lower_bits(nbits); new_value = (*dst & ~real_mask) | (*value & real_mask); if (new_value != *dst) { *dst = new_value; *mod = true; } if (nbits <= 32) break; dst++; nbits -= 32; value++; if (mask) mask++; } } static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) { return map[index / 32] & (1U << (index % 32)); } /** * ethnl_bitset32_size() - Calculate size of bitset nested attribute * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: assume compact format for output * * Estimate length of netlink attribute composed by a later call to * ethnl_put_bitset32() call with the same arguments. * * Return: negative error code or attribute length estimate */ int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { unsigned int len = 0; /* list flag */ if (!mask) len += nla_total_size(sizeof(u32)); /* size */ len += nla_total_size(sizeof(u32)); if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); /* value, mask */ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); } else { unsigned int bits_len = 0; unsigned int bit_len, i; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; /* index */ bit_len = nla_total_size(sizeof(u32)); /* name */ if (name) bit_len += ethnl_strz_size(name); /* value */ if (mask && ethnl_bitmap32_test_bit(val, i)) bit_len += nla_total_size(0); /* bit nest */ bits_len += nla_total_size(bit_len); } /* bits nest */ len += nla_total_size(bits_len); } /* outermost nest */ return nla_total_size(len); } /** * ethnl_put_bitset32() - Put a bitset nest into a message * @skb: skb with the message * @attrtype: attribute type for the bitset nest * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: use compact format for the output * * Compose a nested attribute representing a bitset. If @mask is null, simple * bitmap (bit list) is created, if @mask is provided, represent a value/mask * pair. Bit names are only used in verbose mode and when provided by calller. * * Return: 0 on success, negative error value on error */ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { struct nlattr *nest; struct nlattr *attr; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) goto nla_put_failure; if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); u32 *dst; attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, val, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); if (mask) { attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, mask, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); } } else { struct nlattr *bits; unsigned int i; bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); if (!bits) goto nla_put_failure; for (i = 0; i < nbits; i++) { const char *name = names ? 
names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); if (!attr) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) goto nla_put_failure; if (name && ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) goto nla_put_failure; if (mask && ethnl_bitmap32_test_bit(val, i) && nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) goto nla_put_failure; nla_nest_end(skb, attr); } nla_nest_end(skb, bits); } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct nla_policy bitset_policy[] = { [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, }; static const struct nla_policy bit_policy[] = { [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, }; /** * ethnl_bitset_is_compact() - check if bitset attribute represents a compact * bitset * @bitset: nested attribute representing a bitset * @compact: pointer for return value * * Return: 0 on success, negative error code on failure */ int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset, bitset_policy, NULL); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) { if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) return -EINVAL; *compact = false; return 0; } if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) return -EINVAL; *compact = true; return 0; } /** * ethnl_name_to_idx() - look up string index for a name * @names: array of ETH_GSTRING_LEN sized strings * @n_names: number of strings in the array * @name: name to look up * * Return: index of the string if found, -ENOENT if not found */ static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, const char *name) { unsigned int i; if (!names) return -ENOENT; for (i = 0; i < n_names; i++) { /* names[i] may not be null terminated */ if (!strncmp(names[i], name, ETH_GSTRING_LEN) && strlen(name) <= ETH_GSTRING_LEN) return i; } return -ENOENT; } static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, const struct nlattr *bit_attr, bool no_mask, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bit_policy)]; int ret, idx; ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr, bit_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { const char *name; idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); if (idx >= nbits) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_INDEX], "bit index too high"); return -EOPNOTSUPP; } name = names ? 
names[idx] : NULL; if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "bit index and name mismatch"); return -EINVAL; } } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { idx = ethnl_name_to_idx(names, nbits, nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); if (idx < 0) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_NAME], "bit name not found"); return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "neither bit index nor name specified"); return -EINVAL; } *index = idx; *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; return 0; } /** * ethnl_bitmap32_equal() - Compare two bitmaps * @map1: first bitmap * @map2: second bitmap * @nbits: bit size to compare * * Return: true if first @nbits are equal, false if not */ static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2, unsigned int nbits) { if (memcmp(map1, map2, nbits / 32 * sizeof(u32))) return false; if (nbits % 32 == 0) return true; return !((map1[nbits / 32] ^ map2[nbits / 32]) & ethnl_lower_bits(nbits % 32)); } static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 *saved_bitmap = NULL; struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (no_mask) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); bool dummy; /* The bitmap size is only the size of the map part without * its mask part. 
*/ saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL); if (!saved_bitmap) return -ENOMEM; memcpy(saved_bitmap, bitmap, nbytes); ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy); } nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; unsigned int idx; if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); kfree(saved_bitmap); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) { kfree(saved_bitmap); return ret; } old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); if (!no_mask) *mod = true; } } if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits)) *mod = true; kfree(saved_bitmap); return 0; } static int ethnl_compact_sanity_checks(unsigned int nbits, const struct nlattr *nest, struct nlattr **tb, struct netlink_ext_ack *extack) { bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; unsigned int attr_nbits, attr_nwords; const struct nlattr *test_attr; if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask not allowed in list bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_SIZE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing size in compact bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing value in compact bitset"); return -EINVAL; } if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing mask in compact nonlist bitset"); return -EINVAL; } attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); attr_nwords = DIV_ROUND_UP(attr_nbits, 32); if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "bitset value length does not match size"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK] && nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "bitset mask length does not match size"); return -EINVAL; } if (attr_nbits <= nbits) return 0; test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : tb[ETHTOOL_A_BITSET_MASK]; if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { NL_SET_ERR_MSG_ATTR(extack, test_attr, "cannot modify bits past kernel bitset size"); return -EINVAL; } return 0; } /** * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap * @bitmap: bitmap to update * @nbits: size of the updated bitmap in bits * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * @mod: set this to true if bitmap is modified, leave as it is if not * * Apply bitset netsted attribute to a bitmap. If the attribute represents * a bit list, @bitmap is set to its contents; otherwise, bits in mask are * set to values from value. Bitmaps in the attribute may be longer than * @nbits but the message must not request modifying any bits past @nbits. 
* * Return: negative error code on failure, 0 on success */ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; unsigned int change_bits; bool no_mask; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, names, extack, mod); ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; change_bits = min_t(unsigned int, nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); ethnl_bitmap32_update(bitmap, change_bits, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), no_mask ? NULL : nla_data(tb[ETHTOOL_A_BITSET_MASK]), mod); if (no_mask && change_bits < nbits) ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); return 0; } /** * ethnl_parse_bitset() - Compute effective value and mask from bitset nest * @val: unsigned long based bitmap to put value into * @mask: unsigned long based bitmap to put mask into * @nbits: size of @val and @mask bitmaps * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * * Provide @nbits size long bitmaps for value and mask so that * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x * the same way ethnl_update_bitset() with the same bitset attribute would. * * Return: negative error code on failure, 0 on success */ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; const struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (!tb[ETHTOOL_A_BITSET_BITS]) { unsigned int change_bits; ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); if (change_bits > nbits) change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) bitmap_clear(val, change_bits, nbits - change_bits); if (no_mask) { bitmap_fill(mask, nbits); } else { bitmap_from_arr32(mask, nla_data(tb[ETHTOOL_A_BITSET_MASK]), change_bits); if (change_bits < nbits) bitmap_clear(mask, change_bits, nbits - change_bits); } return 0; } if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } bitmap_zero(val, nbits); if (no_mask) bitmap_fill(mask, nbits); else bitmap_zero(mask, nbits); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { unsigned int idx; bool bit_val; ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; if (bit_val) __set_bit(idx, val); if (!no_mask) __set_bit(idx, mask); } return 0; } #if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) /* 64-bit big endian architectures are the only case when u32 based bitmaps * and unsigned long based 
bitmaps have different memory layout so that we * cannot simply cast the latter to the former and need actual wrappers * converting the latter to the former. * * To reduce the number of slab allocations, the wrappers use fixed size local * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the * majority of bitmaps used by ethtool. */ #define ETHNL_SMALL_BITMAP_BITS 128 #define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; u32 *bitmap32 = small_bitmap32; bool u32_mod = false; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int dst_words = DIV_ROUND_UP(nbits, 32); bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); if (!bitmap32) return -ENOMEM; } bitmap_to_arr32(bitmap32, bitmap, nbits); ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, &u32_mod); if (u32_mod) { bitmap_from_arr32(bitmap, bitmap32, nbits); *mod = true; } if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(bitmap32); return ret; } #else /* On little endian 64-bit and all 32-bit architectures, an unsigned long * based bitmap can be interpreted as u32 based one using a simple cast. 
*/ int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, mod); } #endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
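Editor's note: for orientation, here is a minimal sketch of how an ethtool netlink request handler might drive the unsigned long based wrappers above. It is not taken from any particular handler: DEMO_NBITS, the demo_* helpers and the attrtype parameter are placeholders, the bit-name table is omitted (which the API allows for compact bitsets), and the "bitset.h" include assumes the caller lives next to this file.

#include <linux/netlink.h>
#include <linux/skbuff.h>
#include "bitset.h"		/* assumed: ethnl_put_bitset() and friends */

#define DEMO_NBITS 64		/* placeholder size of the kernel-side bitmap */

/* Reply path: emit @active/@supported as one compact value/mask bitset nest. */
static int demo_fill_reply(struct sk_buff *skb, int attrtype,
			   const unsigned long *active,
			   const unsigned long *supported)
{
	/* A handler would normally use this for reply size estimation. */
	int len = ethnl_bitset_size(active, supported, DEMO_NBITS, NULL, true);

	if (len < 0)
		return len;
	return ethnl_put_bitset(skb, attrtype, active, supported, DEMO_NBITS,
				NULL, true);
}

/* Request path: apply a user-supplied bitset attribute to @bitmap. */
static int demo_apply_request(unsigned long *bitmap, const struct nlattr *attr,
			      struct netlink_ext_ack *extack, bool *mod)
{
	return ethnl_update_bitset(bitmap, DEMO_NBITS, attr, NULL, extack, mod);
}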
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S.
Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; return 0; } static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) { /* Check if . or .. , or skip if namelen is 0 */ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && (de->name[1] == '.' || de->name[1] == '\0')) return true; /* Check if this is a csum entry */ if (de->file_type == EXT4_FT_DIR_CSUM) return true; return false; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); bool has_csum = ext4_has_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - ext4_dir_rec_len(1, has_csum ? 
NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) return err; /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. */ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == 0) { /* m_len should never be zero but let's avoid * an infinite loop if it somehow is */ if (map.m_len == 0) map.m_len = 1; ctx->pos += map.m_len * sb->s_blocksize; continue; } if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto errout; } } if (!bh) { /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirblock_csum_verify(inode, bh)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. 
*/ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < ext4_dir_rec_len(1, inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fstr.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); u32 hash; u32 minor_hash; if (IS_CASEFOLDED(inode)) { hash = EXT4_DIRENT_HASH(de); minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hash = 0; minor_hash = 0; } /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, hash, minor_hash, &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) goto errout; if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; } done: err = 0; errout: fscrypt_fname_free_buffer(&fstr); brelse(bh); return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. 
*/ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); info->cookie = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[] __counted_by(name_len); }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p = filp->private_data; if (is_dx_dir(file_inode(filp)) && !p->initialized) { p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); p->initialized = true; } } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); while (*p) { parent = *p; fname = rb_entry(parent, struct fname, rb_hash); /* * If the hash and minor hash match up, then we put * them on a linked list. This rarely happens... */ if ((new_fn->hash == fname->hash) && (new_fn->minor_hash == fname->minor_hash)) { new_fn->next = fname->next; fname->next = new_fn; return 0; } if (new_fn->hash < fname->hash) p = &(*p)->rb_left; else if (new_fn->hash > fname->hash) p = &(*p)->rb_right; else if (new_fn->minor_hash < fname->minor_hash) p = &(*p)->rb_left; else /* if (new_fn->minor_hash > fname->minor_hash) */ p = &(*p)->rb_right; } rb_link_node(&new_fn->rb_hash, parent, p); rb_insert_color(&new_fn->rb_hash, &info->root); return 0; } /* * This is a helper function for ext4_dx_readdir. It calls filldir * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) 
*/ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, get_dtype(sb, fname->file_type))) { info->extra_fname = fname; return 1; } fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret = 0; ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; info->curr_hash = pos2maj_hash(file, ctx->pos); info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ if (info->extra_fname) { if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. */ if ((!info->curr_node) || !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); } fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: info->last_pos = ctx->pos; return ret < 0 ? 
ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } static int ext4_dir_open(struct inode *inode, struct file *file) { struct dir_private_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; file->private_data = info; return 0; } const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .release = ext4_release_dir, };
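Editor's note: the 32-bit/64-bit f_pos packing done by hash2pos(), pos2maj_hash() and pos2min_hash() above is compact enough to restate as a standalone round-trip check. The snippet below re-derives it outside the kernel in plain C (the demo_* names are mine, not ext4's); the point is that only bit 0 of the major hash is lost when a 64-bit hash is folded into f_pos and recovered again.

#include <assert.h>
#include <stdint.h>

/* 64-bit variant of the packing: pos = ((major >> 1) << 32) | minor */
static uint64_t demo_hash2pos(uint32_t major, uint32_t minor)
{
	return ((uint64_t)(major >> 1) << 32) | minor;
}

static uint32_t demo_pos2maj_hash(uint64_t pos)
{
	return ((pos >> 32) << 1) & 0xffffffff;
}

static uint32_t demo_pos2min_hash(uint64_t pos)
{
	return pos & 0xffffffff;
}

int main(void)
{
	uint32_t major = 0xdeadbeef, minor = 0x12345678;
	uint64_t pos = demo_hash2pos(major, minor);

	/* Only bit 0 of the major hash is dropped by the encoding. */
	assert(demo_pos2maj_hash(pos) == (major & ~1u));
	assert(demo_pos2min_hash(pos) == minor);
	return 0;
}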
7 129 129 123 6 131 112 6 16 1 121 122 1 120 130 131 124 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP upper layer protocol support. * * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * */ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <net/tcp.h> static DEFINE_SPINLOCK(tcp_ulp_list_lock); static LIST_HEAD(tcp_ulp_list); /* Simple linear search, don't expect many entries! */ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) { struct tcp_ulp_ops *e; list_for_each_entry_rcu(e, &tcp_ulp_list, list, lockdep_is_held(&tcp_ulp_list_lock)) { if (strcmp(e->name, name) == 0) return e; } return NULL; } static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; rcu_read_lock(); ulp = tcp_ulp_find(name); #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp-ulp-%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } #endif if (!ulp || !try_module_get(ulp->owner)) ulp = NULL; rcu_read_unlock(); return ulp; } /* Attach new upper layer protocol to the list * of available protocols. */ int tcp_register_ulp(struct tcp_ulp_ops *ulp) { int ret = 0; spin_lock(&tcp_ulp_list_lock); if (tcp_ulp_find(ulp->name)) ret = -EEXIST; else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); spin_unlock(&tcp_ulp_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_ulp); void tcp_unregister_ulp(struct tcp_ulp_ops *ulp) { spin_lock(&tcp_ulp_list_lock); list_del_rcu(&ulp->list); spin_unlock(&tcp_ulp_list_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_ulp); /* Build string with list of available upper layer protocl values */ void tcp_get_available_ulp(char *buf, size_t maxlen) { struct tcp_ulp_ops *ulp_ops; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } void tcp_update_ulp(struct sock *sk, struct proto *proto, void (*write_space)(struct sock *sk)) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops->update) icsk->icsk_ulp_ops->update(sk, proto, write_space); } void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* No sock_owned_by_me() check here as at the time the * stack calls this function, the socket is dead and * about to be destroyed. 
*/ if (!icsk->icsk_ulp_ops) return; if (icsk->icsk_ulp_ops->release) icsk->icsk_ulp_ops->release(sk); module_put(icsk->icsk_ulp_ops->owner); icsk->icsk_ulp_ops = NULL; } static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct inet_connection_sock *icsk = inet_csk(sk); int err; err = -EEXIST; if (icsk->icsk_ulp_ops) goto out_err; if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); err = -ENOTCONN; if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN) goto out_err; err = ulp_ops->init(sk); if (err) goto out_err; icsk->icsk_ulp_ops = ulp_ops; return 0; out_err: module_put(ulp_ops->owner); return err; } int tcp_set_ulp(struct sock *sk, const char *name) { const struct tcp_ulp_ops *ulp_ops; sock_owned_by_me(sk); ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; return __tcp_set_ulp(sk, ulp_ops); }
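Editor's note: to show the registration side of the API above from a ULP's perspective, here is a hypothetical minimal ULP module. The name "demo", the empty hooks and the module boilerplate are placeholders; a real ULP (kTLS registers itself as "tls" through this same interface) installs its own protocol ops in the init hook. Userspace would then select it per socket with setsockopt(fd, IPPROTO_TCP, TCP_ULP, "demo", 4).

#include <linux/module.h>
#include <net/tcp.h>

/* Hypothetical ULP; "demo" and the empty hooks are placeholders. */
static int demo_ulp_init(struct sock *sk)
{
	/* A real ULP installs its own sk_prot/proto ops here. */
	return 0;
}

static void demo_ulp_release(struct sock *sk)
{
	/* Undo whatever demo_ulp_init() set up. */
}

static struct tcp_ulp_ops demo_ulp_ops __read_mostly = {
	.name		= "demo",
	.owner		= THIS_MODULE,
	.init		= demo_ulp_init,
	.release	= demo_ulp_release,
};

static int __init demo_ulp_module_init(void)
{
	return tcp_register_ulp(&demo_ulp_ops);
}

static void __exit demo_ulp_module_exit(void)
{
	tcp_unregister_ulp(&demo_ulp_ops);
}

module_init(demo_ulp_module_init);
module_exit(demo_ulp_module_exit);
MODULE_LICENSE("GPL");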
2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 
3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 
3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "tree-checker.h"

enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info)
{
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return BTRFS_QGROUP_MODE_DISABLED;
        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
                return BTRFS_QGROUP_MODE_SIMPLE;
        return BTRFS_QGROUP_MODE_FULL;
}

bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
{
        return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
}

bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info)
{
        return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
}

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
        u64 ret = 0;
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                ret += qgroup->rsv.values[i];

        return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
        if (type == BTRFS_QGROUP_RSV_DATA)
                return "data";
        if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
                return "meta_pertrans";
        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
                return "meta_prealloc";
        return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
                           struct btrfs_qgroup *qgroup, u64 num_bytes,
                           enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
        qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
                               struct btrfs_qgroup *qgroup, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
        if (qgroup->rsv.values[type] >= num_bytes) {
                qgroup->rsv.values[type] -= num_bytes;
                return;
        }
#ifdef CONFIG_BTRFS_DEBUG
        WARN_RATELIMIT(1,
                "qgroup %llu %s reserved space underflow, have %llu to free %llu",
                qgroup->qgroupid, qgroup_rsv_type_str(type),
                qgroup->rsv.values[type], num_bytes);
#endif
        qgroup->rsv.values[type] = 0;
}

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
                                     struct btrfs_qgroup *dest,
                                     const struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
                                         struct btrfs_qgroup *dest,
                                         const struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->old_refcnt < seq)
                qg->old_refcnt = seq;
        qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->new_refcnt < seq)
                qg->new_refcnt = seq;
        qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg,
                                              u64 seq)
{
        if (qg->old_refcnt < seq)
                return 0;
        return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(const struct
btrfs_qgroup *qg, u64 seq) { if (qg->new_refcnt < seq) return 0; return qg->new_refcnt - seq; } static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { struct rb_node *n = fs_info->qgroup_tree.rb_node; struct btrfs_qgroup *qgroup; while (n) { qgroup = rb_entry(n, struct btrfs_qgroup, node); if (qgroup->qgroupid < qgroupid) n = n->rb_left; else if (qgroup->qgroupid > qgroupid) n = n->rb_right; else return qgroup; } return NULL; } /* * Add qgroup to the filesystem's qgroup tree. * * Must be called with qgroup_lock held and @prealloc preallocated. * * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc. */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node **p = &fs_info->qgroup_tree.rb_node; struct rb_node *parent = NULL; struct btrfs_qgroup *qgroup; /* Caller must have pre-allocated @prealloc. */ ASSERT(prealloc); while (*p) { parent = *p; qgroup = rb_entry(parent, struct btrfs_qgroup, node); if (qgroup->qgroupid < qgroupid) { p = &(*p)->rb_left; } else if (qgroup->qgroupid > qgroupid) { p = &(*p)->rb_right; } else { kfree(prealloc); return qgroup; } } qgroup = prealloc; qgroup->qgroupid = qgroupid; INIT_LIST_HEAD(&qgroup->groups); INIT_LIST_HEAD(&qgroup->members); INIT_LIST_HEAD(&qgroup->dirty); INIT_LIST_HEAD(&qgroup->iterator); INIT_LIST_HEAD(&qgroup->nested_iterator); rb_link_node(&qgroup->node, parent, p); rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); return qgroup; } static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); list_del(&list->next_group); list_del(&list->next_member); kfree(list); } while (!list_empty(&qgroup->members)) { list = list_first_entry(&qgroup->members, struct btrfs_qgroup_list, next_member); list_del(&list->next_group); list_del(&list->next_member); kfree(list); } } /* must be called with qgroup_lock held */ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); __del_qgroup_rb(qgroup); return 0; } /* * Add relation specified by two qgroups. * * Must be called with qgroup_lock held, the ownership of @prealloc is * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { if (!member || !parent) { kfree(prealloc); return -ENOENT; } prealloc->group = parent; prealloc->member = member; list_add_tail(&prealloc->next_group, &member->groups); list_add_tail(&prealloc->next_member, &parent->members); return 0; } /* * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. 
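/*
 * Illustrative sketch, not part of the original file: a minimal user-space
 * model of the per-type reservation counters managed by qgroup_rsv_add() and
 * qgroup_rsv_release() above, showing the clamp-to-zero behaviour on
 * over-release. All names and types here are simplified stand-ins, not
 * kernel API.
 */
#include <stdint.h>
#include <stdio.h>

enum rsv_type { RSV_DATA, RSV_META_PERTRANS, RSV_META_PREALLOC, RSV_LAST };

struct rsv { uint64_t values[RSV_LAST]; };

static void rsv_add(struct rsv *r, enum rsv_type t, uint64_t bytes)
{
        r->values[t] += bytes;
}

static void rsv_release(struct rsv *r, enum rsv_type t, uint64_t bytes)
{
        /* Never let the counter wrap below zero, mirroring qgroup_rsv_release(). */
        if (r->values[t] >= bytes)
                r->values[t] -= bytes;
        else
                r->values[t] = 0;
}

int main(void)
{
        struct rsv r = { {0} };

        rsv_add(&r, RSV_DATA, 4096);
        rsv_release(&r, RSV_DATA, 8192);        /* over-release is clamped to zero */
        printf("data rsv = %llu\n", (unsigned long long)r.values[RSV_DATA]);
        return 0;
}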
* * Return: 0 on success * -ENOENT if one of the ids does not exist * <0 other errors */ static int add_relation_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *prealloc, u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { list_del(&list->next_group); list_del(&list->next_member); kfree(list); return 0; } } return -ENOENT; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl) { struct btrfs_qgroup *qgroup; qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) return -EINVAL; if (qgroup->rfer != rfer || qgroup->excl != excl) return -EINVAL; return 0; } #endif static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot, struct btrfs_qgroup_status_item *ptr) { ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); } /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded */ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) { struct btrfs_key key; struct btrfs_key found_key; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path = NULL; struct extent_buffer *l; int slot; int ret = 0; u64 flags = 0; u64 rescan_progress = 0; if (!fs_info->quota_root) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { ret = -ENOMEM; goto out; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; /* default this to quota off, in case no status key is found */ fs_info->qgroup_flags = 0; /* * pass 1: read status, all qgroup infos and limits */ key.objectid = 0; key.type = 0; key.offset = 0; ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); if (ret) goto out; while (1) { struct btrfs_qgroup *qgroup; slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { struct btrfs_qgroup_status_item *ptr; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); if (btrfs_qgroup_status_version(l, ptr) != BTRFS_QGROUP_STATUS_VERSION) { btrfs_err(fs_info, "old qgroup version, quota disabled"); goto out; } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { qgroup_read_enable_gen(fs_info, l, slot, ptr); } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { 
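/*
 * Illustrative sketch, not kernel code: one relation object is linked into
 * two lists at once -- the member's ->groups list and the parent's ->members
 * list -- which is why __del_qgroup_rb() above unlinks both list heads before
 * freeing it. Plain next pointers stand in for the kernel's struct list_head.
 */
#include <stdint.h>
#include <stdio.h>

struct rel;

struct qg {
        uint64_t id;
        struct rel *groups;     /* relations where this qgroup is the member */
        struct rel *members;    /* relations where this qgroup is the parent */
};

struct rel {
        struct qg *member;
        struct qg *parent;
        struct rel *next_group;         /* chains member->groups */
        struct rel *next_member;        /* chains parent->members */
};

/* Link one relation into both per-qgroup lists, like __add_relation_rb(). */
static void link_relation(struct rel *r, struct qg *member, struct qg *parent)
{
        r->member = member;
        r->parent = parent;
        r->next_group = member->groups;
        member->groups = r;
        r->next_member = parent->members;
        parent->members = r;
}

int main(void)
{
        struct qg member = { .id = 256 };
        struct qg parent = { .id = (1ULL << 48) | 100 };
        struct rel edge = { 0 };

        link_relation(&edge, &member, &parent);
        printf("member %llu -> parent %llu\n",
               (unsigned long long)member.groups->member->id,
               (unsigned long long)member.groups->parent->id);
        return 0;
}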
qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } if (found_key.type != BTRFS_QGROUP_INFO_KEY && found_key.type != BTRFS_QGROUP_LIMIT_KEY) goto next1; qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { btrfs_err(fs_info, "inconsistent qgroup config"); qgroup_mark_inconsistent(fs_info); } if (!qgroup) { struct btrfs_qgroup *prealloc; struct btrfs_root *tree_root = fs_info->tree_root; prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { ret = -ENOMEM; goto out; } qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * * We don't need to lock because this is only called * during mount before we start doing things like creating * subvolumes. */ if (is_fstree(qgroup->qgroupid) && qgroup->qgroupid > tree_root->free_objectid) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, * as it will get checked on the next call to * btrfs_get_free_objectid. */ tree_root->free_objectid = qgroup->qgroupid + 1; } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out; switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); qgroup->excl = btrfs_qgroup_info_excl(l, ptr); qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); /* generation currently unused */ break; } case BTRFS_QGROUP_LIMIT_KEY: { struct btrfs_qgroup_limit_item *ptr; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); break; } } next1: ret = btrfs_next_item(quota_root, path); if (ret < 0) goto out; if (ret) break; } btrfs_release_path(path); /* * pass 2: read all qgroup relations */ key.objectid = 0; key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = 0; ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); if (ret) goto out; while (1) { struct btrfs_qgroup_list *list = NULL; slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); if (found_key.type != BTRFS_QGROUP_RELATION_KEY) goto next2; if (found_key.objectid > found_key.offset) { /* parent <- member, not needed to build config */ /* FIXME should we omit the key completely? 
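/*
 * Illustrative sketch, not kernel code: the key layout the two read passes
 * above iterate over, using a simplified stand-in for struct btrfs_key. The
 * BTRFS_QGROUP_*_KEY constants are referenced by name in comments only; their
 * numeric values are not assumed here.
 *
 * Pass 1: everything sits at objectid 0.
 *   (0, BTRFS_QGROUP_STATUS_KEY, 0)          - one status item per filesystem
 *   (0, BTRFS_QGROUP_INFO_KEY,   qgroupid)   - rfer/excl counters
 *   (0, BTRFS_QGROUP_LIMIT_KEY,  qgroupid)   - limits
 *
 * Pass 2: each relation is stored twice, once per direction, and the loader
 * only uses the copy whose objectid <= offset.
 */
#include <stdint.h>

struct simple_key {
        uint64_t objectid;
        uint8_t type;
        uint64_t offset;
};

/* Build both mirrored relation keys for one member/parent pair. */
void relation_keys(uint64_t member, uint64_t parent, uint8_t rel_type,
                   struct simple_key out[2])
{
        out[0] = (struct simple_key){ .objectid = member, .type = rel_type, .offset = parent };
        out[1] = (struct simple_key){ .objectid = parent, .type = rel_type, .offset = member };
}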
*/ goto next2; } list = kzalloc(sizeof(*list), GFP_KERNEL); if (!list) { ret = -ENOMEM; goto out; } ret = add_relation_rb(fs_info, list, found_key.objectid, found_key.offset); list = NULL; if (ret == -ENOENT) { btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx", found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } if (ret) goto out; next2: ret = btrfs_next_item(quota_root, path); if (ret < 0) goto out; if (ret) break; } out: btrfs_free_path(path); fs_info->qgroup_flags |= flags; if (ret >= 0) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } else { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; btrfs_sysfs_del_qgroups(fs_info); } return ret < 0 ? ret : 0; } /* * Called in close_ctree() when quota is still enabled. This verifies we don't * leak some reserved space. * * Return false if no reserved space is left. * Return true if some reserved space is leaked. */ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info) { struct rb_node *node; bool ret = false; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup * lock. And here we don't go post-order to provide a more user * friendly sorted result. */ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { struct btrfs_qgroup *qgroup; int i; qgroup = rb_entry(node, struct btrfs_qgroup, node); for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { if (qgroup->rsv.values[i]) { ret = true; btrfs_warn(fs_info, "qgroup %hu/%llu has unreleased space, type %d rsv %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), i, qgroup->rsv.values[i]); } } } return ret; } /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.And for the third one, we have set * quota_root to be null with qgroup_lock held before, so it is safe to clean * up the in-memory structures without qgroup_lock held. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. 
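/*
 * Illustrative sketch, not kernel code: the "level/subvolid" pair printed by
 * the leak check above is packed into a single qgroupid. This assumes the
 * 48-bit level shift used by btrfs_qgroup_level()/btrfs_qgroup_subvolid();
 * everything else here is a simplified stand-in.
 */
#include <stdint.h>
#include <stdio.h>

#define QGROUP_LEVEL_SHIFT 48   /* assumed to match the kernel's shift */

static unsigned int qgroup_level(uint64_t qgroupid)
{
        return (unsigned int)(qgroupid >> QGROUP_LEVEL_SHIFT);
}

static uint64_t qgroup_subvolid(uint64_t qgroupid)
{
        return qgroupid & ((1ULL << QGROUP_LEVEL_SHIFT) - 1);
}

int main(void)
{
        uint64_t qgroupid = (1ULL << QGROUP_LEVEL_SHIFT) | 256; /* "1/256" */

        printf("%u/%llu\n", qgroup_level(qgroupid),
               (unsigned long long)qgroup_subvolid(qgroupid));
        return 0;
}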
*/ ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; btrfs_sysfs_del_qgroups(fs_info); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, u64 dst) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = src; key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); btrfs_free_path(path); return ret; } static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, u64 dst) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = src; key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = dst; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, quota_root, path); out: btrfs_free_path(path); return ret; } static int add_qgroup_item(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root, u64 qgroupid) { int ret; struct btrfs_path *path; struct btrfs_qgroup_info_item *qgroup_info; struct btrfs_qgroup_limit_item *qgroup_limit; struct extent_buffer *leaf; struct btrfs_key key; if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroupid; /* * Avoid a transaction abort by catching -EEXIST here. In that * case, we proceed by re-initializing the existing structure * on disk. */ ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; qgroup_info = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); ret = 0; out: btrfs_free_path(path); return ret; } static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroupid; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, quota_root, path); if (ret) goto out; btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = 
btrfs_del_item(trans, quota_root, path); out: btrfs_free_path(path); return ret; } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_root *quota_root = trans->fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_limit_item *qgroup_limit; int ret; int slot; key.objectid = 0; key.type = BTRFS_QGROUP_LIMIT_KEY; key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; if (ret) goto out; l = path->nodes[0]; slot = path->slots[0]; qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); out: btrfs_free_path(path); return ret; } static int update_qgroup_info_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_info_item *qgroup_info; int ret; int slot; if (btrfs_is_testing(fs_info)) return 0; key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; if (ret) goto out; l = path->nodes[0]; slot = path->slots[0]; qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); out: btrfs_free_path(path); return ret; } static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_status_item *ptr; int ret; int slot; key.objectid = 0; key.type = BTRFS_QGROUP_STATUS_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; if (ret) goto out; l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); out: btrfs_free_path(path); return ret; } /* * called with qgroup_lock held */ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf = NULL; int ret; int nr = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.offset = 0; key.type = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret 
< 0) goto out; leaf = path->nodes[0]; nr = btrfs_header_nritems(leaf); if (!nr) break; /* * delete the leaf one by one * since the whole tree is going * to be deleted. */ path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) goto out; btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); return ret; } int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_path *path = NULL; struct btrfs_qgroup_status_item *ptr; struct extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; /* * We need to have subvol_sem write locked, to prevent races between * concurrent tasks trying to enable quotas, because we will unlock * and relock qgroup_ioctl_lock before setting fs_info->quota_root * and before setting BTRFS_FS_QUOTA_ENABLED. */ lockdep_assert_held_write(&fs_info->subvol_sem); if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "qgroups are currently unsupported in extent tree v2"); return -EINVAL; } mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; ulist = ulist_alloc(GFP_KERNEL); if (!ulist) { ret = -ENOMEM; goto out; } ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; /* * Unlock qgroup_ioctl_lock before starting the transaction. This is to * avoid lock acquisition inversion problems (reported by lockdep) between * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we * start a transaction. * After we started the transaction lock qgroup_ioctl_lock again and * check if someone else created the quota root in the meanwhile. If so, * just return success and release the transaction handle. * * Also we don't need to worry about someone else calling * btrfs_sysfs_add_qgroups() after we unlock and getting an error because * that function returns 0 (success) when the sysfs entries already exist. */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item * * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items * per subvolume. However those are not currently reserved since it * would be a lot of overkill. 
*/ trans = btrfs_start_transaction(tree_root, 2); mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } if (fs_info->quota_root) goto out; fs_info->qgroup_ulist = ulist; ulist = NULL; /* * initially create the quota tree */ quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) { ret = PTR_ERR(quota_root); btrfs_abort_transaction(trans, ret); goto out; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_root; } key.objectid = 0; key.type = BTRFS_QGROUP_STATUS_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; } leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; btrfs_release_path(path); ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } while (1) { slot = path->slots[0]; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_ROOT_REF_KEY) { /* Release locks on tree_root before we access quota_root */ btrfs_release_path(path); /* We should not have a stray @prealloc pointer. */ ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; } qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_search_slot_for_read(tree_root, &found_key, path, 1, 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } if (ret > 0) { /* * Shouldn't happen, but in case it does we * don't need to do the btrfs_next_item, just * continue. 
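/*
 * Illustrative sketch, not kernel code: the unlock / do-slow-work / relock-
 * and-recheck pattern that btrfs_quota_enable() uses around starting the
 * transaction, reduced to pthreads. All names here are made up for the
 * example; slow_setup() stands in for btrfs_start_transaction().
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static bool quota_enabled;

static void slow_setup(void)
{
        /* Stand-in for the slow step done without cfg_lock held. */
}

static int enable_once(void)
{
        pthread_mutex_lock(&cfg_lock);
        if (quota_enabled) {
                pthread_mutex_unlock(&cfg_lock);
                return 0;
        }
        pthread_mutex_unlock(&cfg_lock);

        slow_setup();

        pthread_mutex_lock(&cfg_lock);
        /* Re-check: another thread may have won the race while we slept. */
        if (!quota_enabled)
                quota_enabled = true;
        pthread_mutex_unlock(&cfg_lock);
        return 0;
}

int main(void)
{
        return enable_once();
}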
*/ continue; } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } if (ret) break; } out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; goto out_free_path; } qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } fs_info->qgroup_enable_gen = trans->transid; mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid * a deadlock with tasks concurrently doing other qgroup operations, such * adding/removing qgroups or adding/deleting qgroup relations for example, * because all qgroup operations first start or join a transaction and then * lock the qgroup_ioctl_lock mutex. * We are safe from a concurrent task trying to enable quotas, by calling * this function, since we are serialized by fs_info->subvol_sem. */ ret = btrfs_commit_transaction(trans); trans = NULL; mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path; /* * Set quota enabled flag after committing the transaction, to avoid * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot * creation. */ spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) goto out_free_path; ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } else { /* * We have set both BTRFS_FS_QUOTA_ENABLED and * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with * -EINPROGRESS. That can happen because someone started the * rescan worker by calling quota rescan ioctl before we * attempted to initialize the rescan worker. Failure due to * quotas disabled in the meanwhile is not possible, because * we are holding a write lock on fs_info->subvol_sem, which * is also acquired when disabling quotas. * Ignore such error, and any other error would need to undo * everything we did in the transaction we just committed. */ ASSERT(ret == -EINPROGRESS); ret = 0; } out_free_path: btrfs_free_path(path); out_free_root: if (ret) btrfs_put_root(quota_root); out: if (ret) { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; btrfs_sysfs_del_qgroups(fs_info); } mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); ulist_free(ulist); kfree(prealloc); return ret; } /* * It is possible to have outstanding ordered extents which reserved bytes * before we disabled. We need to fully flush delalloc, ordered extents, and a * commit to ensure that we don't leak such reservations, only to have them * come back if we re-enable. 
* * - enable simple quotas * - reserve space * - release it, store rsv_bytes in OE * - disable quotas * - enable simple quotas (qgroup rsv are all 0) * - OE finishes * - run delayed refs * - free rsv_bytes, resulting in miscounting or even underflow */ static int flush_reservations(struct btrfs_fs_info *fs_info) { int ret; ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); return btrfs_commit_current_transaction(fs_info->tree_root); } int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0; /* * We need to have subvol_sem write locked to prevent races with * snapshot creation. */ lockdep_assert_held_write(&fs_info->subvol_sem); /* * Relocation will mess with backrefs, so make sure we have the * cleaner_mutex held to protect us from relocate. */ lockdep_assert_held(&fs_info->cleaner_mutex); mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; /* * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs * to lock that mutex while holding a transaction handle and the rescan * worker needs to commit a transaction. */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may * deadlock with transaction by the qgroup rescan worker. */ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); /* * We have nothing held here and no trans handle, just return the error * if there is one. */ ret = flush_reservations(fs_info); if (ret) return ret; /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in * btrfs_clean_quota_tree but this is not done. * * Also, we must always start a transaction without holding the mutex * qgroup_ioctl_lock, see btrfs_quota_enable(). 
*/ trans = btrfs_start_transaction(fs_info->tree_root, 1); mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_root(trans, &quota_root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } spin_lock(&fs_info->trans_lock); list_del(&quota_root->dirty_list); spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(quota_root->node); btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); if (ret < 0) btrfs_abort_transaction(trans, ret); out: btrfs_put_root(quota_root); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_commit_transaction(trans); return ret; } static void qgroup_dirty(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { if (list_empty(&qgroup->dirty)) list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) { if (!list_empty(&qgroup->iterator)) return; list_add_tail(&qgroup->iterator, head); } static void qgroup_iterator_clean(struct list_head *head) { while (!list_empty(head)) { struct btrfs_qgroup *qgroup; qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); list_del_init(&qgroup->iterator); } } /* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. * * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to * parent. * Or when child tries to release reservation space, parent will underflow its * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock. */ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup *cur; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(cur, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup_dirty(fs_info, qgroup); /* Append parent qgroups to @qgroup_list. */ list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; out: qgroup_iterator_clean(&qgroup_list); return ret; } /* * Quick path for updating qgroup with only excl refs. * * In that case, just update all parent will be enough. 
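/*
 * Illustrative sketch, not kernel code: the "easy accounting" case handled by
 * __qgroup_excl_accounting() above. When the child's numbers are fully
 * exclusive (excl == rfer), attaching or detaching it simply adds or subtracts
 * that byte count from an ancestor. This is a single-ancestor model; the real
 * code walks every ancestor through the iterator list.
 */
#include <stdint.h>
#include <stdio.h>

struct counts { uint64_t rfer; uint64_t excl; };

static void excl_account(struct counts *ancestor, const struct counts *child,
                         int sign)
{
        ancestor->rfer += sign * (int64_t)child->excl;
        ancestor->excl += sign * (int64_t)child->excl;
}

int main(void)
{
        struct counts parent = { .rfer = 1 << 20, .excl = 1 << 20 };
        struct counts child  = { .rfer = 4096, .excl = 4096 };  /* excl == rfer */

        excl_account(&parent, &child, 1);       /* child assigned to parent */
        printf("parent rfer=%llu excl=%llu\n",
               (unsigned long long)parent.rfer,
               (unsigned long long)parent.excl);
        return 0;
}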
* Or we needs to do a full rescan. * Caller should also hold fs_info->qgroup_lock. * * Return 0 for quick update, return >0 for need to full rescan * and mark INCONSISTENT flag. * Return < 0 for other error. */ static int quick_update_accounting(struct btrfs_fs_info *fs_info, u64 src, u64 dst, int sign) { struct btrfs_qgroup *qgroup; int ret = 1; qgroup = find_qgroup_rb(fs_info, src); if (!qgroup) goto out; if (qgroup->excl == qgroup->rfer) { ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); if (ret < 0) goto out; ret = 0; } out: if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; return ret; } /* * Add relation between @src and @dst qgroup. The @prealloc is allocated by the * callers and transferred here (either used or freed on error). */ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, struct btrfs_qgroup_list *prealloc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; int ret = 0; ASSERT(prealloc); /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); if (!member || !parent) { ret = -EINVAL; goto out; } /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { ret = -EEXIST; goto out; } } ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out; ret = add_qgroup_relation_item(trans, dst, src); if (ret) { del_qgroup_relation_item(trans, src, dst); goto out; } spin_lock(&fs_info->qgroup_lock); ret = __add_relation_rb(prealloc, member, parent); prealloc = NULL; if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; } ret = quick_update_accounting(fs_info, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: kfree(prealloc); mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; bool found = false; int ret = 0; int ret2; if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); /* * The parent/member pair doesn't exist, then try to delete the dead * relation items only. 
*/ if (!member || !parent) goto delete_item; /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) { found = true; break; } } delete_item: ret = del_qgroup_relation_item(trans, src, dst); if (ret < 0 && ret != -ENOENT) goto out; ret2 = del_qgroup_relation_item(trans, dst, src); if (ret2 < 0 && ret2 != -ENOENT) goto out; /* At least one deletion succeeded, return 0 */ if (!ret || !ret2) ret = 0; if (found) { spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); ret = quick_update_accounting(fs_info, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); } out: return ret; } int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); ret = __del_qgroup_relation(trans, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *prealloc = NULL; int ret = 0; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return 0; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } quota_root = fs_info->quota_root; qgroup = find_qgroup_rb(fs_info, qgroupid); if (qgroup) { ret = -EEXIST; goto out; } prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) { ret = -ENOMEM; goto out; } ret = add_qgroup_item(trans, quota_root, qgroupid); if (ret) goto out; spin_lock(&fs_info->qgroup_lock); qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid); spin_unlock(&fs_info->qgroup_lock); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); kfree(prealloc); return ret; } /* * Return 0 if we can not delete the qgroup (not empty or has children etc). * Return >0 if we can delete the qgroup. * Return <0 for other errors during tree search. */ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct btrfs_key key; struct btrfs_path *path; int ret; /* * Squota would never be inconsistent, but there can still be case * where a dropped subvolume still has qgroup numbers, and squota * relies on such qgroup for future accounting. * * So for squota, do not allow dropping any non-zero qgroup. */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr)) return 0; /* For higher level qgroup, we can only delete it if it has no child. */ if (btrfs_qgroup_level(qgroup->qgroupid)) { if (!list_empty(&qgroup->members)) return 0; return 1; } /* * For level-0 qgroups, we can only delete it if it has no subvolume * for it. * This means even a subvolume is unlinked but not yet fully dropped, * we can not delete the qgroup. */ key.objectid = qgroup->qgroupid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = -1ULL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); btrfs_free_path(path); /* * The @ret from btrfs_find_root() exactly matches our definition for * the return value, thus can be returned directly. 
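/*
 * Illustrative sketch, not kernel code: the deletion policy implemented by
 * can_delete_qgroup() above, reduced to flags. Simple quotas keep any qgroup
 * that still carries numbers, higher-level qgroups must have no members, and
 * a level-0 qgroup must no longer have a backing subvolume root item (the
 * root-item lookup is reduced to a boolean here).
 */
#include <stdbool.h>

struct qg_model {
        unsigned int level;
        bool simple_quota;
        bool has_numbers;       /* any of rfer/excl/rfer_cmpr/excl_cmpr non-zero */
        bool has_members;       /* child qgroups still attached below it */
        bool subvol_exists;     /* root item still present, level 0 only */
};

bool qg_can_delete(const struct qg_model *qg)
{
        if (qg->simple_quota && qg->has_numbers)
                return false;
        if (qg->level > 0)
                return !qg->has_members;
        return !qg->subvol_exists;
}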
*/ return ret; } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) { ret = -ENOENT; goto out; } ret = can_delete_qgroup(fs_info, qgroup); if (ret < 0) goto out; if (ret == 0) { ret = -EBUSY; goto out; } /* Check if there are no children of this qgroup */ if (!list_empty(&qgroup->members)) { ret = -EBUSY; goto out; } ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); ret = __del_qgroup_relation(trans, qgroupid, list->group->qgroupid); if (ret) goto out; } spin_lock(&fs_info->qgroup_lock); /* * Warn on reserved space. The subvolume should has no child nor * corresponding subvolume. * Thus its reserved space should all be zero, no matter if qgroup * is consistent or the mode. */ if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); } /* * The same for rfer/excl numbers, but that's only if our qgroup is * consistent and if it's in regular qgroup mode. * For simple mode it's not as accurate thus we can hit non-zero values * very frequently. */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { if (qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr) { WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), qgroup->rfer, qgroup->rfer_cmpr, qgroup->excl, qgroup->excl_cmpr); qgroup_mark_inconsistent(fs_info); } } del_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); /* * Remove the qgroup from sysfs now without holding the qgroup_lock * spinlock, since the sysfs_remove_group() function needs to take * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). */ btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid) { struct btrfs_trans_handle *trans; int ret; if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root) return 0; /* * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; /* Start new trans to delete the qgroup info and limit items. 
*/ trans = btrfs_start_transaction(fs_info->quota_root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_remove_qgroup(trans, subvolid); btrfs_end_transaction(trans); /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. * * Or the qgroup is already removed by a qgroup rescan. For both cases we're * safe to ignore them. */ if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. * To meet this requirement, we treat the -1 as a special value * which tell kernel to clear the limit on this qgroup. */ const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; } qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) { ret = -ENOENT; goto out; } spin_lock(&fs_info->qgroup_lock); if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { if (limit->max_rfer == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; qgroup->max_rfer = 0; } else { qgroup->max_rfer = limit->max_rfer; } } if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { if (limit->max_excl == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; qgroup->max_excl = 0; } else { qgroup->max_excl = limit->max_excl; } } if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { if (limit->rsv_rfer == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; qgroup->rsv_rfer = 0; } else { qgroup->rsv_rfer = limit->rsv_rfer; } } if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { if (limit->rsv_excl == CLEAR_VALUE) { qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; qgroup->rsv_excl = 0; } else { qgroup->rsv_excl = limit->rsv_excl; } } qgroup->lim_flags |= limit->flags; spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_limit_item(trans, qgroup); if (ret) { qgroup_mark_inconsistent(fs_info); btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. * * No lock version, caller must acquire delayed ref lock and allocated memory, * then call btrfs_qgroup_trace_extent_post() after exiting lock context. * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. * Return <0 for insertion failure, caller can free @record safely. 
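 */

/*
 * Purely illustrative sketch, not part of the original file: ownership of
 * @record follows the return value described above.  demo_trace_one() below
 * is a hypothetical caller showing only the ownership rules; the real
 * wrapper, btrfs_qgroup_trace_extent(), additionally reserves and releases
 * the xarray slot.
 */
static int demo_trace_one(struct btrfs_fs_info *fs_info,
			  struct btrfs_delayed_ref_root *delayed_refs,
			  struct btrfs_qgroup_extent_record *record, u64 bytenr)
{
	int ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs,
						   record, bytenr);

	/*
	 * Non-zero means the record was not inserted (duplicate or error),
	 * so it still belongs to the caller and must be freed here.
	 */
	if (ret)
		kfree(record);
	/* On 0 the xarray took ownership; the record is freed at commit time. */
	return ret < 0 ? ret : 0;
}

/*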
*/ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record, u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; #if BITS_PER_LONG == 32 if (bytenr >= MAX_LFS_FILESIZE) { btrfs_err_rl(fs_info, "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", bytenr); btrfs_err_32bit_limit(fs_info); return -EOVERFLOW; } #endif trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); existing = xa_load(&delayed_refs->dirty_extents, index); if (existing) { if (record->data_rsv && !existing->data_rsv) { existing->data_rsv = record->data_rsv; existing->data_rsv_refroot = record->data_rsv_refroot; } xa_unlock(&delayed_refs->dirty_extents); return 1; } ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { qgroup_mark_inconsistent(fs_info); return xa_err(ret); } return 0; } /* * Post handler after qgroup_trace_extent_nolock(). * * NOTE: Current qgroup does the expensive backref walk at transaction * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming * new transaction. * This is designed to allow btrfs_find_all_roots() to get correct new_roots * result. * * However for old_roots there is no need to do backref walk at that time, * since we search commit roots to walk backref and result will always be * correct. * * Due to the nature of no lock version, we can't do backref there. * So we must call btrfs_qgroup_trace_extent_post() after exiting * spinlock context. * * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result * using current root, then we can move all expensive backref walk out of * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_backref_walk_ctx ctx = { .bytenr = bytenr, .fs_info = fs_info, }; int ret; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a * transaction handle. Often we are called when adding a data delayed * reference from btrfs_truncate_inode_items() (truncating or unlinking), * in which case we will be holding a write lock on extent buffer from a * subvolume tree. In this case we can't allow btrfs_find_all_roots() to * acquire fs_info->commit_root_sem, because that is a higher level lock * that must be acquired before locking any extent buffers. * * So we want btrfs_find_all_roots() to not acquire the commit_root_sem * but we can't pass it a non-NULL transaction handle, because otherwise * it would not use commit roots and would lock extent buffers, causing * a deadlock if it ends up trying to read lock the same extent buffer * that was previously write locked at btrfs_truncate_inode_items(). * * So pass a NULL transaction handle to btrfs_find_all_roots() and * explicitly tell it to not acquire the commit_root_sem - if we are * holding a transaction handle we don't need its protection. 
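 *
 * Purely illustrative summary, not from the original source, of how the two
 * root lists end up being collected:
 *
 *	old_roots (here):      ctx.trans = NULL, walk the commit roots now;
 *	new_roots (at commit): ctx.trans = trans, ctx.time_seq = BTRFS_SEQ_LAST,
 *	                       done by btrfs_qgroup_account_extents() below.
 *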
*/ ASSERT(trans != NULL); if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { qgroup_mark_inconsistent(fs_info); btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; } /* * Here we don't need to get the lock of * trans->transaction->delayed_refs, since inserted qrecord won't * be deleted, only qrecord->node may be modified (new qrecord insert) * * So modifying qrecord->old_roots is safe here */ qrecord->old_roots = ctx.roots; return 0; } /* * Inform qgroup to trace one dirty extent, specified by @bytenr and * @num_bytes. * So qgroup can account it at commit trans time. * * Better encapsulated version, with memory allocation and backref walk for * commit roots. * So this can sleep. * * Return 0 if the operation is done. * Return <0 for error, like memory allocation failure or invalid parameter * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0; record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } record->num_bytes = num_bytes; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* * Inform qgroup to trace all leaf items of data * * Return 0 for success * Return <0 for error(ENOMEM) */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = trans->fs_info; int nr = btrfs_header_nritems(eb); int i, extent_type, ret; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; /* We can be called directly from walk_up_proc() */ if (!btrfs_qgroup_full_accounting(fs_info)) return 0; for (i = 0; i < nr; i++) { btrfs_item_key_to_cpu(eb, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); /* filter out non qgroup-accountable extents */ extent_type = btrfs_file_extent_type(eb, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) continue; bytenr = btrfs_file_extent_disk_bytenr(eb, fi); if (!bytenr) continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } cond_resched(); return 0; } /* * Walk up the tree from the bottom, freeing leaves and any interior * nodes which have had all slots visited. If a node (leaf or * interior) is freed, the node above it will have it's slot * incremented. The root node will never be freed. * * At the end of this function, we should have a path which has all * slots incremented to the next position for a search. If we need to * read a new node it will be NULL and the node above it will have the * correct slot selected for a later read. * * If we increment the root nodes slot counter past the number of * elements, 1 is returned to signal completion of the search. 
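 */

/*
 * Purely illustrative sketch, not part of the original file: the leaf scan
 * in btrfs_qgroup_trace_leaf_items() above only accounts regular/prealloc
 * file extents that actually point at disk space.  demo_leaf_item_is_traced()
 * is hypothetical and just restates that filter:
 */
static bool demo_leaf_item_is_traced(struct extent_buffer *eb, int slot)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(eb, &key, slot);
	if (key.type != BTRFS_EXTENT_DATA_KEY)
		return false;	/* not a file extent item at all */

	fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE)
		return false;	/* inline data lives inside the leaf itself */

	/* A zero disk_bytenr marks a hole, nothing to account. */
	return btrfs_file_extent_disk_bytenr(eb, fi) != 0;
}

/*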
*/ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) { int level = 0; int nr, slot; struct extent_buffer *eb; if (root_level == 0) return 1; while (level <= root_level) { eb = path->nodes[level]; nr = btrfs_header_nritems(eb); path->slots[level]++; slot = path->slots[level]; if (slot >= nr || level == 0) { /* * Don't free the root - we will detect this * condition after our loop and return a * positive value for caller to stop walking the tree. */ if (level != root_level) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; free_extent_buffer(eb); path->nodes[level] = NULL; path->slots[level] = 0; } } else { /* * We have a valid slot to walk back down * from. Stop here so caller can process these * new nodes. */ break; } level++; } eb = path->nodes[root_level]; if (path->slots[root_level] >= btrfs_header_nritems(eb)) return 1; return 0; } /* * Helper function to trace a subtree tree block swap. * * The swap will happen in highest tree block, but there may be a lot of * tree blocks involved. * * For example: * OO = Old tree blocks * NN = New tree blocks allocated during balance * * File tree (257) Reloc tree for 257 * L2 OO NN * / \ / \ * L1 OO OO (a) OO NN (a) * / \ / \ / \ / \ * L0 OO OO OO OO OO OO NN NN * (b) (c) (b) (c) * * When calling qgroup_trace_extent_swap(), we will pass: * @src_eb = OO(a) * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] * @dst_level = 0 * @root_level = 1 * * In that case, qgroup_trace_extent_swap() will search from OO(a) to * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. * * The main work of qgroup_trace_extent_swap() can be split into 3 parts: * * 1) Tree search from @src_eb * It should acts as a simplified btrfs_search_slot(). * The key for search can be extracted from @dst_path->nodes[dst_level] * (first key). * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree. 
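 */

/*
 * Purely illustrative sketch, not part of the original file: step 2) above
 * boils down to feeding both copies of the swapped tree block into the
 * normal extent tracing.  demo_trace_block_pair() is hypothetical:
 */
static int demo_trace_block_pair(struct btrfs_trans_handle *trans,
				 struct extent_buffer *src_eb,
				 struct extent_buffer *dst_eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	/* Account the old copy in the file tree... */
	ret = btrfs_qgroup_trace_extent(trans, src_eb->start, fs_info->nodesize);
	if (ret < 0)
		return ret;
	/* ...and the new copy created for it in the reloc tree. */
	return btrfs_qgroup_trace_extent(trans, dst_eb->start, fs_info->nodesize);
}

/*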
*/ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int dst_level, int root_level, bool trace_leaf) { struct btrfs_key key; struct btrfs_path *src_path; struct btrfs_fs_info *fs_info = trans->fs_info; u32 nodesize = fs_info->nodesize; int cur_level = root_level; int ret; BUG_ON(dst_level > root_level); /* Level mismatch */ if (btrfs_header_level(src_eb) != root_level) return -EINVAL; src_path = btrfs_alloc_path(); if (!src_path) { ret = -ENOMEM; goto out; } if (dst_level) btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); else btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ atomic_inc(&src_eb->refs); src_path->nodes[root_level] = src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; /* A simplified version of btrfs_search_slot() */ while (cur_level >= dst_level) { struct btrfs_key src_key; struct btrfs_key dst_key; if (src_path->nodes[cur_level] == NULL) { struct extent_buffer *eb; int parent_slot; eb = src_path->nodes[cur_level + 1]; parent_slot = src_path->slots[cur_level + 1]; eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); src_path->locks[cur_level] = BTRFS_READ_LOCK; } src_path->slots[cur_level] = dst_path->slots[cur_level]; if (cur_level) { btrfs_node_key_to_cpu(dst_path->nodes[cur_level], &dst_key, dst_path->slots[cur_level]); btrfs_node_key_to_cpu(src_path->nodes[cur_level], &src_key, src_path->slots[cur_level]); } else { btrfs_item_key_to_cpu(dst_path->nodes[cur_level], &dst_key, dst_path->slots[cur_level]); btrfs_item_key_to_cpu(src_path->nodes[cur_level], &src_key, src_path->slots[cur_level]); } /* Content mismatch, something went wrong */ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { ret = -ENOENT; goto out; } cur_level--; } /* * Now both @dst_path and @src_path have been populated, record the tree * blocks for qgroup accounting. */ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, nodesize); if (ret < 0) goto out; ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, nodesize); if (ret < 0) goto out; /* Record leaf file extents */ if (dst_level == 0 && trace_leaf) { ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) goto out; ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); } out: btrfs_free_path(src_path); return ret; } /* * Helper function to do recursive generation-aware depth-first search, to * locate all new tree blocks in a subtree of reloc tree. * * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) * reloc tree * L2 NN (a) * / \ * L1 OO NN (b) * / \ / \ * L0 OO OO OO NN * (c) (d) * If we pass: * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], * @cur_level = 1 * @root_level = 1 * * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace * above tree blocks along with their counter parts in file tree. * While during search, old tree blocks OO(c) will be skipped as tree block swap * won't affect OO(c). 
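 */

/*
 * Purely illustrative sketch, not part of the original file: the "skip old
 * tree blocks" rule above is a pure generation check.
 * demo_count_new_children() is hypothetical and only shows which child
 * pointers the search descends into:
 */
static u32 demo_count_new_children(struct extent_buffer *node, u64 last_snapshot)
{
	u32 nr_new = 0;
	int i;

	for (i = 0; i < btrfs_header_nritems(node); i++) {
		/* Blocks older than the last snapshot were not swapped. */
		if (btrfs_node_ptr_generation(node, i) >= last_snapshot)
			nr_new++;
	}
	return nr_new;
}

/*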
*/ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int cur_level, int root_level, u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *eb; bool need_cleanup = false; int ret = 0; int i; /* Level sanity check */ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || root_level < cur_level) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", __func__, cur_level, root_level); return -EUCLEAN; } /* Read the tree block if needed */ if (dst_path->nodes[cur_level] == NULL) { int parent_slot; u64 child_gen; /* * dst_path->nodes[root_level] must be initialized before * calling this function. */ if (cur_level == root_level) { btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", __func__, root_level, root_level, cur_level); return -EUCLEAN; } /* * We need to get child blockptr/gen from parent before we can * read it. */ eb = dst_path->nodes[cur_level + 1]; parent_slot = dst_path->slots[cur_level + 1]; child_gen = btrfs_node_ptr_generation(eb, parent_slot); /* This node is old, no need to trace */ if (child_gen < last_snapshot) goto out; eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } dst_path->nodes[cur_level] = eb; dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); dst_path->locks[cur_level] = BTRFS_READ_LOCK; need_cleanup = true; } /* Now record this tree block and its counter part for qgroups */ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, root_level, trace_leaf); if (ret < 0) goto cleanup; eb = dst_path->nodes[cur_level]; if (cur_level > 0) { /* Iterate all child tree blocks */ for (i = 0; i < btrfs_header_nritems(eb); i++) { /* Skip old tree blocks as they won't be swapped */ if (btrfs_node_ptr_generation(eb, i) < last_snapshot) continue; dst_path->slots[cur_level] = i; /* Recursive call (at most 7 times) */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, cur_level - 1, root_level, last_snapshot, trace_leaf); if (ret < 0) goto cleanup; } } cleanup: if (need_cleanup) { /* Clean up */ btrfs_tree_unlock_rw(dst_path->nodes[cur_level], dst_path->locks[cur_level]); free_extent_buffer(dst_path->nodes[cur_level]); dst_path->nodes[cur_level] = NULL; dst_path->slots[cur_level] = 0; dst_path->locks[cur_level] = 0; } out: return ret; } static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, struct extent_buffer *src_eb, struct extent_buffer *dst_eb, u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *dst_path = NULL; int level; int ret; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* Wrong parameter order */ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, btrfs_header_generation(src_eb), btrfs_header_generation(dst_eb)); return -EUCLEAN; } if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { ret = -EIO; goto out; } level = btrfs_header_level(dst_eb); dst_path = btrfs_alloc_path(); if (!dst_path) { ret = -ENOMEM; goto out; } /* For dst_path */ atomic_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; /* Do the generation aware breadth-first search */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 
dst_path, level, level, last_snapshot, trace_leaf); if (ret < 0) goto out; ret = 0; out: btrfs_free_path(dst_path); if (ret < 0) qgroup_mark_inconsistent(fs_info); return ret; } /* * Inform qgroup to trace a whole subtree, including all its child tree * blocks and data. * The root tree block is specified by @root_eb. * * Normally used by relocation(tree block swap) and subvolume deletion. * * Return 0 for success * Return <0 for error(ENOMEM or tree search error) */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; int level; u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); ASSERT(root_eb != NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; spin_lock(&fs_info->qgroup_lock); drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; spin_unlock(&fs_info->qgroup_lock); /* * This function only gets called for snapshot drop, if we hit a high * node here, it means we are going to change ownership for quite a lot * of extents, which will greatly slow down btrfs_commit_transaction(). * * So here if we find a high tree here, we just skip the accounting and * mark qgroup inconsistent. */ if (root_level >= drop_subptree_thres) { qgroup_mark_inconsistent(fs_info); return 0; } if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { .transid = root_gen, .level = root_level }; ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out; } if (root_level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); goto out; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * Walk down the tree. Missing extent blocks are filled in as * we go. Metadata is accounted every time we read a new * extent block. * * When we reach a leaf, we account for file extent items in it, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ atomic_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { int parent_slot; u64 child_bytenr; /* * We need to get child blockptr from parent before we * can read it. 
*/ eb = path->nodes[level + 1]; parent_slot = path->slots[level + 1]; child_bytenr = btrfs_node_blockptr(eb, parent_slot); eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } path->nodes[level] = eb; path->slots[level] = 0; btrfs_tree_read_lock(eb); path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize); if (ret) goto out; } if (level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[level]); if (ret) goto out; /* Nonzero return here means we completed our search */ ret = adjust_slots_upwards(path, root_level); if (ret) break; /* Restart search with new slots */ goto walk_down; } level--; } ret = 0; out: btrfs_free_path(path); return ret; } static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) { if (!list_empty(&qgroup->nested_iterator)) return; list_add_tail(&qgroup->nested_iterator, head); } static void qgroup_iterator_nested_clean(struct list_head *head) { while (!list_empty(head)) { struct btrfs_qgroup *qgroup; qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); list_del_init(&qgroup->nested_iterator); } } #define UPDATE_NEW 0 #define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts. */ static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; if (!roots) return; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { LIST_HEAD(tmp); qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; qgroup_iterator_nested_add(qgroups, qg); qgroup_iterator_add(&tmp, qg); list_for_each_entry(qg, &tmp, iterator) { struct btrfs_qgroup_list *glist; if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { qgroup_iterator_nested_add(qgroups, glist->group); qgroup_iterator_add(&tmp, glist->group); } } qgroup_iterator_clean(&tmp); } } /* * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- * B | * | - | * ------------------------------------- * !B | + | ** | * ------------------------------------- * * Conditions: * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing * *: Definitely not changed. **: Possible unchanged. * * For !A and !B condition, the exception is cur_old/new_roots == 0 case. * * To make the logic clear, we first use condition A and B to split * combination into 4 results. * * Then, for result "+" and "-", check old/new_roots == 0 case, as in them * only on variant maybe 0. * * Lastly, check result **, since there are 2 variants maybe 0, split them * again(2x2). * But this time we don't need to consider other things, the codes and logic * is easy to understand now. 
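 */

/*
 * Purely illustrative sketch, not part of the original file: the table above
 * can be condensed into the two predicates below (the demo_* name is
 * hypothetical).  For example, with nr_old_roots = nr_new_roots = 2, a qgroup
 * whose refcnt drops from cur_old = 2 to cur_new = 1 goes exclusive -> shared
 * and gives num_bytes back from its excl counter.
 */
static void demo_excl_transition(u64 cur_old, u64 cur_new,
				 u64 nr_old, u64 nr_new,
				 bool *gains_excl, bool *loses_excl)
{
	/* Exclusive before, but now shared or not referenced at all. */
	*loses_excl = cur_old == nr_old && cur_old != 0 &&
		      (cur_new < nr_new || cur_new == 0);
	/* Shared or unreferenced before, now referenced by every root. */
	*gains_excl = cur_new == nr_new && cur_new != 0 &&
		      (cur_old < nr_old || cur_old == 0);
}

/*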
*/ static void qgroup_update_counters(struct btrfs_fs_info *fs_info, struct list_head *qgroups, u64 nr_old_roots, u64 nr_new_roots, u64 num_bytes, u64 seq) { struct btrfs_qgroup *qg; list_for_each_entry(qg, qgroups, nested_iterator) { u64 cur_new_count, cur_old_count; bool dirty = false; cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); trace_qgroup_update_counters(fs_info, qg, cur_old_count, cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } /* Excl update part */ /* Exclusive/none -> shared case */ if (cur_old_count == nr_old_roots && cur_new_count < nr_new_roots) { /* Exclusive -> shared */ if (cur_old_count != 0) { qg->excl -= num_bytes; qg->excl_cmpr -= num_bytes; dirty = true; } } /* Shared -> exclusive/none case */ if (cur_old_count < nr_old_roots && cur_new_count == nr_new_roots) { /* Shared->exclusive */ if (cur_new_count != 0) { qg->excl += num_bytes; qg->excl_cmpr += num_bytes; dirty = true; } } /* Exclusive/none -> exclusive/none case */ if (cur_old_count == nr_old_roots && cur_new_count == nr_new_roots) { if (cur_old_count == 0) { /* None -> exclusive/none */ if (cur_new_count != 0) { /* None -> exclusive */ qg->excl += num_bytes; qg->excl_cmpr += num_bytes; dirty = true; } /* None -> none, nothing changed */ } else { /* Exclusive -> exclusive/none */ if (cur_new_count == 0) { /* Exclusive -> none */ qg->excl -= num_bytes; qg->excl_cmpr -= num_bytes; dirty = true; } /* Exclusive -> exclusive, nothing changed */ } } if (dirty) qgroup_dirty(fs_info, qg); } } /* * Check if the @roots potentially is a list of fs tree roots * * Return 0 for definitely not a fs/subvol tree roots ulist * Return 1 for possible fs/subvol tree roots in the list (considering an empty * one as well) */ static int maybe_fs_roots(struct ulist *roots) { struct ulist_node *unode; struct ulist_iterator uiter; /* Empty one, still possible for fs roots */ if (!roots || roots->nnodes == 0) return 1; ULIST_ITER_INIT(&uiter); unode = ulist_next(roots, &uiter); if (!unode) return 1; /* * If it contains fs tree roots, then it must belong to fs/subvol * trees. * If it contains a non-fs tree, it won't be shared with fs/subvol trees. */ return is_fstree(unode->val); } int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots) { struct btrfs_fs_info *fs_info = trans->fs_info; LIST_HEAD(qgroups); u64 seq; u64 nr_new_roots = 0; u64 nr_old_roots = 0; int ret = 0; /* * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. 
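 *
 * Purely illustrative example, not from the original source: because both
 * ulists are consumed on every path out of this function, a caller hands
 * them over and forgets them, as btrfs_qgroup_account_extents() below does:
 *
 *	btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes,
 *				    record->old_roots, new_roots);
 *	record->old_roots = NULL;	ownership was transferred
 *	new_roots = NULL;
 *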
*/ if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; if (new_roots) { if (!maybe_fs_roots(new_roots)) goto out_free; nr_new_roots = new_roots->nnodes; } if (old_roots) { if (!maybe_fs_roots(old_roots)) goto out_free; nr_old_roots = old_roots->nnodes; } /* Quick exit, either not fs tree roots, or won't affect any qgroup */ if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); mutex_lock(&fs_info->qgroup_rescan_lock); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { mutex_unlock(&fs_info->qgroup_rescan_lock); ret = 0; goto out_free; } } mutex_unlock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); /* Update new refcnts using new_roots */ qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); /* * We're done using the iterator, release all its qgroups while holding * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() * and trigger use-after-free accesses to qgroups. */ qgroup_iterator_nested_clean(&qgroups); /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; spin_unlock(&fs_info->qgroup_lock); out_free: ulist_free(old_roots); ulist_free(new_roots); return ret; } int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return 0; delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* * Old roots should be searched when inserting qgroup * extent record. * * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, * we may have some record inserted during * NO_ACCOUNTING (thus no old_roots populated), but * later we start rescan, which clears NO_ACCOUNTING, * leaving some inserted records without old_roots * populated. * * Those cases are rare and should not cause too much * time spent during commit_transaction(). */ if (!record->old_roots) { /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; record->old_roots = ctx.roots; ctx.roots = NULL; } /* * Use BTRFS_SEQ_LAST as time_seq to do special search, * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction(). 
*/ ctx.trans = trans; ctx.time_seq = BTRFS_SEQ_LAST; ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, 0); } ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); record->old_roots = NULL; new_roots = NULL; } /* Free the reserved data space */ btrfs_qgroup_free_refroot(fs_info, record->data_rsv_refroot, record->data_rsv, BTRFS_QGROUP_RSV_DATA); cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } trace_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); return ret; } /* * Writes all changed qgroups to disk. * Called by the transaction commit path and the qgroup assign ioctl. */ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; /* * In case we are called from the qgroup assign ioctl, assert that we * are holding the qgroup_ioctl_lock, otherwise we can race with a quota * disable operation (ioctl) and access a freed quota root. */ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) lockdep_assert_held(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) return ret; spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { struct btrfs_qgroup *qgroup; qgroup = list_first_entry(&fs_info->dirty_qgroups, struct btrfs_qgroup, dirty); list_del_init(&qgroup->dirty); spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) qgroup_mark_inconsistent(fs_info); ret = update_qgroup_limit_item(trans, qgroup); if (ret) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_status_item(trans); if (ret) qgroup_mark_inconsistent(fs_info); return ret; } int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size) { if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) return -EINVAL; /* * In the past we allowed btrfs_qgroup_inherit to specify to copy * rfer/excl numbers directly from other qgroups. This behavior has * been disabled in userspace for a very long time, but here we should * also disable it in kernel, as this behavior is known to mark qgroup * inconsistent, and a rescan would wipe out the changes anyway. * * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. */ if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) return -EINVAL; if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) return -EINVAL; /* * Skip the inherit source qgroups check if qgroup is not enabled. * Qgroup can still be later enabled causing problems, but in that case * btrfs_qgroup_inherit() would just ignore those invalid ones. */ if (!btrfs_qgroup_enabled(fs_info)) return 0; /* * Now check all the remaining qgroups, they should all: * * - Exist * - Be higher level qgroups. 
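 *
 * Purely illustrative example, not from the original source: qgroup ids
 * encode the level in their top 16 bits, so a spec naming only the level 1
 * qgroup "1/100" would look like:
 *
 *	inherit->num_qgroups = 1;
 *	inherit->qgroups[0]  = (1ULL << 48) | 100;
 *	size                 = struct_size(inherit, qgroups, 1);
 *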
*/ for (int i = 0; i < inherit->num_qgroups; i++) { struct btrfs_qgroup *qgroup; u64 qgroupid = inherit->qgroups[i]; if (btrfs_qgroup_level(qgroupid) == 0) return -EINVAL; spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) { spin_unlock(&fs_info->qgroup_lock); return -ENOENT; } spin_unlock(&fs_info->qgroup_lock); } return 0; } static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, u64 inode_rootid, struct btrfs_qgroup_inherit **inherit) { int i = 0; u64 num_qgroups = 0; struct btrfs_qgroup *inode_qg; struct btrfs_qgroup_list *qg_list; struct btrfs_qgroup_inherit *res; size_t struct_sz; u64 *qgids; if (*inherit) return -EEXIST; inode_qg = find_qgroup_rb(fs_info, inode_rootid); if (!inode_qg) return -ENOENT; num_qgroups = list_count_nodes(&inode_qg->groups); if (!num_qgroups) return 0; struct_sz = struct_size(res, qgroups, num_qgroups); if (struct_sz == SIZE_MAX) return -ERANGE; res = kzalloc(struct_sz, GFP_NOFS); if (!res) return -ENOMEM; res->num_qgroups = num_qgroups; qgids = res->qgroups; list_for_each_entry(qg_list, &inode_qg->groups, next_group) qgids[i++] = qg_list->group->qgroupid; *inherit = res; return 0; } /* * Check if we can skip rescan when inheriting qgroups. If @src has a single * @parent, and that @parent is owning all its bytes exclusively, we can skip * the full rescan, by just adding nodesize to the @parent's excl/rfer. * * Return <0 for fatal errors (like srcid/parentid has no qgroup). * Return 0 if a quick inherit is done. * Return >0 if a quick inherit is not possible, and a full rescan is needed. */ static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, u64 srcid, u64 parentid) { struct btrfs_qgroup *src; struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; int nr_parents = 0; src = find_qgroup_rb(fs_info, srcid); if (!src) return -ENOENT; parent = find_qgroup_rb(fs_info, parentid); if (!parent) return -ENOENT; /* * Source has no parent qgroup, but our new qgroup would have one. * Qgroup numbers would become inconsistent. */ if (list_empty(&src->groups)) return 1; list_for_each_entry(list, &src->groups, next_group) { /* The parent is not the same, quick update is not possible. */ if (list->group->qgroupid != parentid) return 1; nr_parents++; /* * More than one parent qgroup, we can't be sure about accounting * consistency. */ if (nr_parents > 1) return 1; } /* * The parent is not exclusively owning all its bytes. We're not sure * if the source has any bytes not fully owned by the parent. */ if (parent->excl != parent->rfer) return 1; parent->excl += fs_info->nodesize; parent->rfer += fs_info->nodesize; return 0; } /* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit) { int ret = 0; u64 *i_qgroups; bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; struct btrfs_qgroup *prealloc; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false; u32 level_size = 0; u64 nums; prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) return -ENOMEM; /* * There are only two callers of this function. 
* * One in create_subvol() in the ioctl context, which needs to hold * the qgroup_ioctl_lock. * * The other one in create_pending_snapshot() where no other qgroup * code can modify the fs as they all need to either start a new trans * or hold a trans handler, thus we don't need to hold * qgroup_ioctl_lock. * This would avoid long and complex lock chain and make lockdep happy. */ spin_lock(&fs_info->trans_lock); if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) committing = true; spin_unlock(&fs_info->trans_lock); if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); if (!btrfs_qgroup_enabled(fs_info)) goto out; quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; goto out; } if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); if (ret) goto out; free_inherit = true; } if (inherit) { i_qgroups = (u64 *)(inherit + 1); nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2 * inherit->num_excl_copies; for (int i = 0; i < nums; i++) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); /* * Zero out invalid groups so we can ignore * them later. */ if (!srcgroup || ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) *i_qgroups = 0ULL; ++i_qgroups; } } /* * create a tracking group for the subvol itself */ ret = add_qgroup_item(trans, quota_root, objectid); if (ret) goto out; /* * add qgroup to all inherited groups */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) { if (*i_qgroups == 0) continue; ret = add_qgroup_relation_item(trans, objectid, *i_qgroups); if (ret && ret != -EEXIST) goto out; ret = add_qgroup_relation_item(trans, *i_qgroups, objectid); if (ret && ret != -EEXIST) goto out; } ret = 0; qlist_prealloc = kcalloc(inherit->num_qgroups, sizeof(struct btrfs_qgroup_list *), GFP_NOFS); if (!qlist_prealloc) { ret = -ENOMEM; goto out; } for (int i = 0; i < inherit->num_qgroups; i++) { qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), GFP_NOFS); if (!qlist_prealloc[i]) { ret = -ENOMEM; goto out; } } } spin_lock(&fs_info->qgroup_lock); dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); prealloc = NULL; if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { dstgroup->lim_flags = inherit->lim.flags; dstgroup->max_rfer = inherit->lim.max_rfer; dstgroup->max_excl = inherit->lim.max_excl; dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; qgroup_dirty(fs_info, dstgroup); } if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; /* * We call inherit after we clone the root in order to make sure * our counts don't go crazy, so at this point the only * difference between the two roots should be the root node. */ level_size = fs_info->nodesize; dstgroup->rfer = srcgroup->rfer; dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; dstgroup->excl = level_size; dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; /* inherit the limit info */ dstgroup->lim_flags = srcgroup->lim_flags; dstgroup->max_rfer = srcgroup->max_rfer; dstgroup->max_excl = srcgroup->max_excl; dstgroup->rsv_rfer = srcgroup->rsv_rfer; dstgroup->rsv_excl = srcgroup->rsv_excl; qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); /* * If the source qgroup has parent but the new one doesn't, * we need a full rescan. 
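 *
 * Purely illustrative example, not from the original source: with the
 * assignments above, snapshotting a subvolume that references 1 GiB on a
 * 16 KiB nodesize filesystem leaves the counters as:
 *
 *	src: rfer = 1 GiB, excl = 16 KiB	(only its root node is unique)
 *	dst: rfer = 1 GiB, excl = 16 KiB	(ditto for the new snapshot)
 *
 * because everything below the two root nodes is now shared between them.
 *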
*/ if (!inherit && !list_empty(&srcgroup->groups)) need_rescan = true; } if (!inherit) goto unlock; i_qgroups = (u64 *)(inherit + 1); for (int i = 0; i < inherit->num_qgroups; i++) { if (*i_qgroups) { ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, *i_qgroups); qlist_prealloc[i] = NULL; if (ret) goto unlock; } if (srcid) { /* Check if we can do a quick inherit. */ ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); if (ret < 0) goto unlock; if (ret > 0) need_rescan = true; ret = 0; } ++i_qgroups; } for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; if (!i_qgroups[0] || !i_qgroups[1]) continue; src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); if (!src || !dst) { ret = -EINVAL; goto unlock; } dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; /* Manually tweaking numbers certainly needs a rescan */ need_rescan = true; } for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; if (!i_qgroups[0] || !i_qgroups[1]) continue; src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); if (!src || !dst) { ret = -EINVAL; goto unlock; } dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; need_rescan = true; } unlock: spin_unlock(&fs_info->qgroup_lock); if (!ret) ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) qgroup_mark_inconsistent(fs_info); if (qlist_prealloc) { for (int i = 0; i < inherit->num_qgroups; i++) kfree(qlist_prealloc[i]); kfree(qlist_prealloc); } if (free_inherit) kfree(inherit); kfree(prealloc); return ret; } static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; return true; } static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = btrfs_root_id(root); int ret = 0; LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return 0; if (num_bytes == 0) return 0; if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && capable(CAP_SYS_RESOURCE)) enforce = false; spin_lock(&fs_info->qgroup_lock); if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { ret = -EDQUOT; goto out; } list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ list_for_each_entry(qgroup, &qgroup_list, iterator) qgroup_rsv_add(fs_info, qgroup, num_bytes, type); out: qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); return ret; } /* * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 * qgroup). * * Will handle all higher level qgroup too. 
* * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. * This special case is only used for META_PERTRANS type. */ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return; if (num_bytes == 0) return; if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { WARN(1, "%s: Invalid type to free", __func__); return; } spin_lock(&fs_info->qgroup_lock); if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; if (num_bytes == (u64)-1) /* * We're freeing all pertrans rsv, get reserved value from * level 0 qgroup as real num_bytes to free. */ num_bytes = qgroup->rsv.values[type]; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup_rsv_release(fs_info, qgroup, num_bytes, type); list_for_each_entry(glist, &qgroup->groups, next_group) { qgroup_iterator_add(&qgroup_list, glist->group); } } out: qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } /* * Check if the leaf is the last leaf. Which means all node pointers * are at their last position. */ static bool is_last_leaf(struct btrfs_path *path) { int i; for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) return false; } return true; } /* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. */ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; u64 num_bytes; bool done; int slot; int ret; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); ret = btrfs_search_slot_for_read(extent_root, &fs_info->qgroup_rescan_progress, path, 1, 0); btrfs_debug(fs_info, "current progress key (%llu %u %llu), search_slot ret %d", fs_info->qgroup_rescan_progress.objectid, fs_info->qgroup_rescan_progress.type, fs_info->qgroup_rescan_progress.offset, ret); if (ret) { /* * The rescan is about to end, we will not be scanning any * further blocks. We cannot unset the RESCAN flag here, because * we want to commit the transaction if everything went well. * To make the live accounting work in this phase, we set our * scan progress pointer such that every real extent objectid * will be smaller. 
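 *
 * Purely illustrative example, not from the original source: live accounting
 * in btrfs_qgroup_account_extent() skips an extent only while the rescan has
 * not yet passed it (progress.objectid <= bytenr), e.g. with the cursor at
 * 4 GiB:
 *
 *	extent at 1 GiB    -> already rescanned, account it live
 *	extent at 8 GiB    -> skip, the rescan will pick it up later
 *	progress = (u64)-1 -> nothing is skipped any more
 *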
*/ fs_info->qgroup_rescan_progress.objectid = (u64)-1; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } done = is_last_leaf(path); btrfs_item_key_to_cpu(path->nodes[0], &found, btrfs_header_nritems(path->nodes[0]) - 1); fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); if (!scratch_leaf) { ret = -ENOMEM; mutex_unlock(&fs_info->qgroup_rescan_lock); goto out; } slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { struct btrfs_backref_walk_ctx ctx = { 0 }; btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) continue; if (found.type == BTRFS_METADATA_ITEM_KEY) num_bytes = fs_info->nodesize; else num_bytes = found.offset; ctx.bytenr = found.objectid; ctx.fs_info = fs_info; ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } out: if (scratch_leaf) free_extent_buffer(scratch_leaf); if (done && !ret) { ret = 1; fs_info->qgroup_rescan_progress.objectid = (u64)-1; } return ret; } static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { if (btrfs_fs_closing(fs_info)) return true; if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) return true; if (!btrfs_qgroup_enabled(fs_info)) return true; if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) return true; return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; int ret = 0; bool stopped = false; bool did_leaf_rescans = false; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } /* * Rescan should only search for commit root, and any later difference * should be recorded by qgroup */ path->search_commit_root = 1; path->skip_locking = 1; while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); break; } ret = qgroup_rescan_leaf(trans, path); did_leaf_rescans = true; if (ret > 0) btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); } out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); if (ret > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } else if (ret < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); /* * Only update status, since the previous part has already updated the * qgroup info, and only if we did any actual work. This also prevents * race with a concurrent quota disable, which has already set * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at * btrfs_quota_disable(). 
*/ if (did_leaf_rescans) { trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", ret); } } else { trans = NULL; } mutex_lock(&fs_info->qgroup_rescan_lock); if (!stopped || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { int ret2 = update_qgroup_status_item(trans); if (ret2 < 0) { ret = ret2; btrfs_err(fs_info, "fail to update qgroup status: %d", ret); } } fs_info->qgroup_rescan_running = false; fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; complete_all(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); if (!trans) return; btrfs_end_transaction(trans); if (stopped) { btrfs_info(fs_info, "qgroup scan paused"); } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { btrfs_info(fs_info, "qgroup scan cancelled"); } else if (ret >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", ret > 0 ? " (inconsistency flag cleared)" : ""); } else { btrfs_err(fs_info, "qgroup scan failed with %d", ret); } } /* * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all * memory required for the rescan context. */ static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags) { int ret = 0; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); return -EINVAL; } if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_debug(fs_info, "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -ENOTCONN; } if (ret) return ret; } mutex_lock(&fs_info->qgroup_rescan_lock); if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { ret = -EINPROGRESS; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -ENOTCONN; } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } if (ret) { mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, btrfs_qgroup_rescan_worker, NULL); return 0; } static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; spin_lock(&fs_info->qgroup_lock); /* clear all current qgroup tracking information */ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { qgroup = rb_entry(n, struct btrfs_qgroup, node); qgroup->rfer = 0; qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) { int ret = 0; 
ret = qgroup_rescan_init(fs_info, 0, 1); if (ret) return ret; /* * We have set the rescan_progress to 0, which means no more * delayed refs will be accounted by btrfs_qgroup_account_ref. * However, btrfs_qgroup_account_ref may be right after its call * to btrfs_find_all_roots, in which case it would still do the * accounting. * To solve this, we're committing the transaction, which will * ensure we run all delayed refs and only after that, we are * going to clear all tracking information for a clean start. */ ret = btrfs_commit_current_transaction(fs_info->fs_root); if (ret) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return ret; } qgroup_rescan_zero_tracking(fs_info); mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); mutex_unlock(&fs_info->qgroup_rescan_lock); return 0; } int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); running = fs_info->qgroup_rescan_running; mutex_unlock(&fs_info->qgroup_rescan_lock); if (!running) return 0; if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); else wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } /* * this is only called from open_ctree where we're still single threaded, thus * locking is omitted here. */ void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); mutex_unlock(&fs_info->qgroup_rescan_lock); } } #define rbtree_iterate_from_safe(node, next, start) \ for (node = start; node && ({ next = rb_next(node); 1;}); node = next) static int qgroup_unreserve_range(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { struct rb_node *node; struct rb_node *next; struct ulist_node *entry; int ret = 0; node = reserved->range_changed.root.rb_node; if (!node) return 0; while (node) { entry = rb_entry(node, struct ulist_node, rb_node); if (entry->val < start) node = node->rb_right; else node = node->rb_left; } if (entry->val > start && rb_prev(&entry->rb_node)) entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, rb_node); rbtree_iterate_from_safe(node, next, &entry->rb_node) { u64 entry_start; u64 entry_end; u64 entry_len; int clear_ret; entry = rb_entry(node, struct ulist_node, rb_node); entry_start = entry->val; entry_end = entry->aux; entry_len = entry_end - entry_start + 1; if (entry_start >= start + len) break; if (entry_start + entry_len <= start) continue; /* * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ clear_ret = clear_extent_bits(&inode->io_tree, entry_start, entry_end, EXTENT_QGROUP_RESERVED); if (!ret && clear_ret < 0) ret = clear_ret; ulist_del(&reserved->range_changed, entry->val, entry->aux); if (likely(reserved->bytes_changed >= entry_len)) { reserved->bytes_changed -= entry_len; } else { WARN_ON(1); reserved->bytes_changed = 0; } } return ret; } /* * Try to free some space for qgroup. * * For qgroup, there are only 3 ways to free qgroup space: * - Flush nodatacow write * Any nodatacow write will free its reserved data space at run_delalloc_range(). 
* In theory, we should only flush nodatacow inodes, but it's not yet * possible, so we need to flush the whole root. * * - Wait for ordered extents * When ordered extents are finished, their reserved metadata is finally * converted to per_trans status, which can be freed by later commit * transaction. * * - Commit transaction * This would free the meta_per_trans space. * In theory this shouldn't provide much space, but any more qgroup space * is needed. */ static int try_flush_qgroup(struct btrfs_root *root) { int ret; /* Can't hold an open transaction or we run the risk of deadlocking. */ ASSERT(current->journal_info == NULL); if (WARN_ON(current->journal_info)) return 0; /* * We don't want to run flush again and again, so if there is a running * one, we won't try to start a new flush, but exit directly. */ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { wait_event(root->qgroup_flush_wait, !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); return 0; } ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, NULL); /* * After waiting for ordered extents run delayed iputs in order to free * space from unlinked files before committing the current transaction, * as ordered extents may have been holding the last reference of an * inode and they add a delayed iput when they complete. */ btrfs_run_delayed_iputs(root->fs_info); btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); return ret; } static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { struct btrfs_root *root = inode->root; struct extent_changeset *reserved; bool new_reserved = false; u64 orig_reserved; u64 to_reserve; int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ if (WARN_ON(!reserved_ret)) return -EINVAL; if (!*reserved_ret) { new_reserved = true; *reserved_ret = extent_changeset_alloc(); if (!*reserved_ret) return -ENOMEM; } reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) goto out; ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; return ret; cleanup: qgroup_unreserve_range(inode, reserved, start, len); out: if (new_reserved) { extent_changeset_free(reserved); *reserved_ret = NULL; } return ret; } /* * Reserve qgroup space for range [start, start + len). * * This function will either reserve space from related qgroups or do nothing * if the range is already reserved. * * Return 0 for successful reservation * Return <0 for error (including -EQUOT) * * NOTE: This function may sleep for memory allocation, dirty page flushing and * commit transaction. So caller should not hold any dirty page locked. 
*/ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { int ret; ret = qgroup_reserve_data(inode, reserved_ret, start, len); if (ret <= 0 && ret != -EDQUOT) return ret; ret = try_flush_qgroup(inode->root); if (ret < 0) return ret; return qgroup_reserve_data(inode, reserved_ret, start, len); } /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; u64 freed = 0; int ret; extent_changeset_init(&changeset); len = round_up(start + len, root->fs_info->sectorsize); start = round_down(start, root->fs_info->sectorsize); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) { u64 range_start = unode->val; /* unode->aux is the inclusive end */ u64 range_len = unode->aux - range_start + 1; u64 free_start; u64 free_len; extent_changeset_release(&changeset); /* Only free range in range [start, start + len) */ if (range_start >= start + len || range_start + range_len <= start) continue; free_start = max(range_start, start); free_len = min(start + len, range_start + range_len) - free_start; /* * TODO: To also modify reserved->ranges_reserved to reflect * the modification. * * However as long as we free qgroup reserved according to * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ ret = clear_record_extent_bits(&inode->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; freed += changeset.bytes_changed; } btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed, BTRFS_QGROUP_RSV_DATA); if (freed_ret) *freed_ret = freed; ret = 0; out: extent_changeset_release(&changeset); return ret; } static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { return clear_record_extent_bits(&inode->io_tree, start, start + len - 1, EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; if (free) trace_op = QGROUP_FREE; trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(inode->root->fs_info, btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); if (released) *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; } /* * Free a reserved space range from io_tree and related qgroups * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. * if @reserved is given, only reserved range in [@start, @start + @len) will * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. 
*/ int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *freed) { return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* * Release a reserved space range from io_tree only. * * Should be called when a range of pages get written to disk and corresponding * FILE_EXTENT is inserted into corresponding root. * * Since new qgroup accounting framework will only update qgroup numbers at * commit_transaction() time, its reserved space shouldn't be freed from * related qgroups. * * But we should release the range from io_tree, to allow further write to be * COWed. * * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type) { if (type != BTRFS_QGROUP_RSV_META_PREALLOC && type != BTRFS_QGROUP_RSV_META_PERTRANS) return; if (num_bytes == 0) return; spin_lock(&root->qgroup_meta_rsv_lock); if (type == BTRFS_QGROUP_RSV_META_PREALLOC) root->qgroup_meta_rsv_prealloc += num_bytes; else root->qgroup_meta_rsv_pertrans += num_bytes; spin_unlock(&root->qgroup_meta_rsv_lock); } static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type) { if (type != BTRFS_QGROUP_RSV_META_PREALLOC && type != BTRFS_QGROUP_RSV_META_PERTRANS) return 0; if (num_bytes == 0) return 0; spin_lock(&root->qgroup_meta_rsv_lock); if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, num_bytes); root->qgroup_meta_rsv_prealloc -= num_bytes; } else { num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, num_bytes); root->qgroup_meta_rsv_pertrans -= num_bytes; } spin_unlock(&root->qgroup_meta_rsv_lock); return num_bytes; } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; /* * Record what we have reserved into root. * * To avoid quota disabled->enabled underflow. * In that case, we may try to free space we haven't reserved * (since quota was disabled), so record what we reserved into root. * And ensure later release won't underflow this number. 
*/ add_root_meta_rsv(root, num_bytes, type); return ret; } int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce, bool noflush) { int ret; ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); if (ret < 0) return ret; return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } /* * Per-transaction meta reservation should be all freed at transaction commit * time */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type) { struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(btrfs_root_id(root))) return; /* * reservation for META_PREALLOC can happen before quota is enabled, * which can lead to underflow. * Here ensure we will only free what we really have reserved. */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); if (num_bytes == 0) return; if (!fs_info->quota_root) return; spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup_rsv_release(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); if (!sb_rdonly(fs_info->sb)) qgroup_rsv_add(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } out: qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } /* * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. * * This is called when preallocated meta reservation needs to be used. * Normally after btrfs_join_transaction() call. 
*/ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) { struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator iter; int ret; extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { ULIST_ITER_INIT(&iter); while ((unode = ulist_next(&changeset.range_changed, &iter))) { btrfs_warn(inode->root->fs_info, "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", btrfs_ino(inode), unode->val, unode->aux); } btrfs_qgroup_free_refroot(inode->root->fs_info, btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } extent_changeset_release(&changeset); } void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks) { int i; spin_lock_init(&swapped_blocks->lock); for (i = 0; i < BTRFS_MAX_LEVEL; i++) swapped_blocks->blocks[i] = RB_ROOT; swapped_blocks->swapped = false; } /* * Delete all swapped blocks record of @root. * Every record here means we skipped a full subtree scan for qgroup. * * Gets called when committing one transaction. */ void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) { struct btrfs_qgroup_swapped_blocks *swapped_blocks; int i; swapped_blocks = &root->swapped_blocks; spin_lock(&swapped_blocks->lock); if (!swapped_blocks->swapped) goto out; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { struct rb_root *cur_root = &swapped_blocks->blocks[i]; struct btrfs_qgroup_swapped_block *entry; struct btrfs_qgroup_swapped_block *next; rbtree_postorder_for_each_entry_safe(entry, next, cur_root, node) kfree(entry); swapped_blocks->blocks[i] = RB_ROOT; } swapped_blocks->swapped = false; out: spin_unlock(&swapped_blocks->lock); } /* * Add subtree roots record into @subvol_root. 
* * @subvol_root: tree root of the subvolume tree get swapped * @bg: block group under balance * @subvol_parent/slot: pointer to the subtree root in subvolume tree * @reloc_parent/slot: pointer to the subtree root in reloc tree * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot) { struct btrfs_fs_info *fs_info = subvol_root->fs_info; struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct rb_node **cur; struct rb_node *parent = NULL; int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { btrfs_err_rl(fs_info, "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", __func__, btrfs_node_ptr_generation(subvol_parent, subvol_slot), btrfs_node_ptr_generation(reloc_parent, reloc_slot)); return -EUCLEAN; } block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) { ret = -ENOMEM; goto out; } /* * @reloc_parent/slot is still before swap, while @block is going to * record the bytenr after swap, so we do the swap here. */ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, reloc_slot); block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, subvol_slot); block->last_snapshot = last_snapshot; block->level = level; /* * If we have bg == NULL, we're called from btrfs_recover_relocation(), * no one else can modify tree blocks thus we qgroup will not change * no matter the value of trace_leaf. */ if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) block->trace_leaf = true; else block->trace_leaf = false; btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); /* Insert @block into @blocks */ spin_lock(&blocks->lock); cur = &blocks->blocks[level].rb_node; while (*cur) { struct btrfs_qgroup_swapped_block *entry; parent = *cur; entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, node); if (entry->subvol_bytenr < block->subvol_bytenr) { cur = &(*cur)->rb_left; } else if (entry->subvol_bytenr > block->subvol_bytenr) { cur = &(*cur)->rb_right; } else { if (entry->subvol_generation != block->subvol_generation || entry->reloc_bytenr != block->reloc_bytenr || entry->reloc_generation != block->reloc_generation) { /* * Duplicated but mismatch entry found. * Shouldn't happen. * * Marking qgroup inconsistent should be enough * for end users. */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); ret = -EEXIST; } kfree(block); goto out_unlock; } } rb_link_node(&block->node, parent, cur); rb_insert_color(&block->node, &blocks->blocks[level]); blocks->swapped = true; out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) qgroup_mark_inconsistent(fs_info); return ret; } /* * Check if the tree block is a subtree root, and if so do the needed * delayed subtree trace for qgroup. * * This is called during btrfs_cow_block(). 
*/ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; struct rb_node *node; bool found = false; bool swapped = false; int level = btrfs_header_level(subvol_eb); int ret = 0; int i; if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); if (!blocks->swapped) { spin_unlock(&blocks->lock); return 0; } node = blocks->blocks[level].rb_node; while (node) { block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); if (block->subvol_bytenr < subvol_eb->start) { node = node->rb_left; } else if (block->subvol_bytenr > subvol_eb->start) { node = node->rb_right; } else { found = true; break; } } if (!found) { spin_unlock(&blocks->lock); goto out; } /* Found one, remove it from @blocks first and update blocks->swapped */ rb_erase(&block->node, &blocks->blocks[level]); for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (RB_EMPTY_ROOT(&blocks->blocks[i])) { swapped = true; break; } } blocks->swapped = swapped; spin_unlock(&blocks->lock); check.level = block->level; check.transid = block->reloc_generation; check.has_first_key = true; memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); /* Read out reloc subtree root */ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; goto free_out; } if (!extent_buffer_uptodate(reloc_eb)) { ret = -EIO; goto free_out; } ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, block->last_snapshot, block->trace_leaf); free_out: kfree(block); free_extent_buffer(reloc_eb); out: if (ret < 0) { btrfs_err_rl(fs_info, "failed to account subtree at bytenr %llu: %d", subvol_eb->start, ret); qgroup_mark_inconsistent(fs_info); } return ret; } void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; unsigned long index; xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } xa_destroy(&trans->delayed_refs.dirty_extents); } int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { int ret; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *qg; LIST_HEAD(qgroup_list); u64 root = delta->root; u64 num_bytes = delta->num_bytes; const int sign = (delta->is_inc ? 1 : -1); if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) return 0; if (!is_fstree(root)) return 0; /* If the extent predates enabling quotas, don't count it. */ if (delta->generation < fs_info->qgroup_enable_gen) return 0; spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, root); if (!qgroup) { ret = -ENOENT; goto out; } ret = 0; qgroup_iterator_add(&qgroup_list, qgroup); list_for_each_entry(qg, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qg->excl += num_bytes * sign; qg->rfer += num_bytes * sign; qgroup_dirty(fs_info, qg); list_for_each_entry(glist, &qg->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); } qgroup_iterator_clean(&qgroup_list); out: spin_unlock(&fs_info->qgroup_lock); return ret; }
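The data and metadata reservation paths above (btrfs_qgroup_reserve_data() and __btrfs_qgroup_reserve_meta()) share one retry discipline: attempt the reservation, and only when it fails with -EDQUOT flush delalloc, ordered extents and a transaction commit via try_flush_qgroup(), then try exactly once more. The standalone userspace C sketch below is only an illustration of that shape under simplified assumptions; quota_pool, pool_reserve(), pool_flush() and pool_reserve_flushing() are hypothetical names invented for this example and are not part of the btrfs code.

/*
 * Minimal sketch of the reserve -> flush-on-EDQUOT -> retry-once pattern.
 * Hypothetical userspace model, not kernel code.
 */
#include <errno.h>
#include <stdio.h>

struct quota_pool {
    long long limit;     /* hard limit, analogous to a qgroup max_rfer */
    long long reserved;  /* bytes currently reserved */
    long long flushable; /* bytes a flush could reclaim */
};

/* Try to take @bytes from the pool; fail with -EDQUOT when over the limit. */
static int pool_reserve(struct quota_pool *p, long long bytes)
{
    if (p->reserved + bytes > p->limit)
        return -EDQUOT;
    p->reserved += bytes;
    return 0;
}

/* Stand-in for try_flush_qgroup(): reclaim whatever is reclaimable. */
static void pool_flush(struct quota_pool *p)
{
    p->reserved -= p->flushable;
    p->flushable = 0;
}

/* Reserve with a single flush-and-retry, mirroring the structure above. */
static int pool_reserve_flushing(struct quota_pool *p, long long bytes)
{
    int ret = pool_reserve(p, bytes);

    if (ret != -EDQUOT)
        return ret;
    pool_flush(p);
    return pool_reserve(p, bytes);
}

int main(void)
{
    struct quota_pool p = { .limit = 100, .reserved = 90, .flushable = 40 };
    int ret;

    /* First attempt fails, the flush frees 40 bytes, the retry succeeds. */
    ret = pool_reserve_flushing(&p, 20);
    printf("reserve 20 -> %d (reserved=%lld of %lld)\n",
           ret, p.reserved, p.limit);
    return 0;
}

The single retry keeps the slow path bounded: a second -EDQUOT after flushing means the quota really is exhausted, so the error is simply returned to the caller, which matches the structure of the kernel helpers above.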
// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which emulate a local clock-event * device via a broadcast event source. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/smp.h> #include <linux/module.h> #include "tick-internal.h" /* * Broadcast support for broken x86 hardware, where the local apic * timer stops in C3 state. 
*/ static struct tick_device tick_broadcast_device; static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly; static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly; static cpumask_var_t tmpmask __cpumask_var_read_mostly; static int tick_broadcast_forced; static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); # ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); # endif #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } # ifdef CONFIG_HOTPLUG_CPU static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } # endif #endif /* * Debugging: see timer_list.c */ struct tick_device *tick_get_broadcast_device(void) { return &tick_broadcast_device; } struct cpumask *tick_get_broadcast_mask(void) { return tick_broadcast_mask; } static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu); const struct clock_event_device *tick_get_wakeup_device(int cpu) { return tick_get_oneshot_wakeup_device(cpu); } /* * Start the device in periodic mode */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { if (bc) tick_setup_periodic(bc, 1); } /* * Check, if the device can be utilized as broadcast device: */ static bool tick_check_broadcast_device(struct clock_event_device *curdev, struct clock_event_device *newdev) { if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || (newdev->features & CLOCK_EVT_FEAT_PERCPU) || (newdev->features & CLOCK_EVT_FEAT_C3STOP)) return false; if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) return false; return !curdev || newdev->rating > curdev->rating; } #ifdef CONFIG_TICK_ONESHOT static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) { return per_cpu(tick_oneshot_wakeup_device, cpu); } static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) { /* * If we woke up early and the tick was reprogrammed in the * meantime then this may be spurious but harmless. 
*/ tick_receive_broadcast(); } static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, int cpu) { struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu); if (!newdev) goto set_device; if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || (newdev->features & CLOCK_EVT_FEAT_C3STOP)) return false; if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) || !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) return false; if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) return false; if (curdev && newdev->rating <= curdev->rating) return false; if (!try_module_get(newdev->owner)) return false; newdev->event_handler = tick_oneshot_wakeup_handler; set_device: clockevents_exchange_device(curdev, newdev); per_cpu(tick_oneshot_wakeup_device, cpu) = newdev; return true; } #else static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) { return NULL; } static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, int cpu) { return false; } #endif /* * Conditionally install/replace broadcast device */ void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { struct clock_event_device *cur = tick_broadcast_device.evtdev; if (tick_set_oneshot_wakeup_device(dev, cpu)) return; if (!tick_check_broadcast_device(cur, dev)) return; if (!try_module_get(dev->owner)) return; clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * If the system already runs in oneshot mode, switch the newly * registered broadcast device to oneshot mode explicitly. */ if (tick_broadcast_oneshot_active()) { tick_broadcast_switch_to_oneshot(); return; } /* * Inform all cpus about this. We might be in a situation * where we did not switch to oneshot mode because the per cpu * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack * of a oneshot capable broadcast device. Without that * notification the systems stays stuck in periodic mode * forever. */ tick_clock_notify(); } /* * Check, if the device is the broadcast device */ int tick_is_broadcast_device(struct clock_event_device *dev) { return (dev && tick_broadcast_device.evtdev == dev); } int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { int ret = -ENODEV; if (tick_is_broadcast_device(dev)) { raw_spin_lock(&tick_broadcast_lock); ret = __clockevents_update_freq(dev, freq); raw_spin_unlock(&tick_broadcast_lock); } return ret; } static void err_broadcast(const struct cpumask *mask) { pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); } static void tick_device_setup_broadcast_func(struct clock_event_device *dev) { if (!dev->broadcast) dev->broadcast = tick_broadcast; if (!dev->broadcast) { pr_warn_once("%s depends on broadcast, but no broadcast function available\n", dev->name); dev->broadcast = err_broadcast; } } /* * Check, if the device is dysfunctional and a placeholder, which * needs to be handled by the broadcast device. */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; int ret = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Devices might be registered with both periodic and oneshot * mode disabled. 
This signals, that the device needs to be * operated from the broadcast device and is a placeholder for * the cpu local device. */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else tick_broadcast_setup_oneshot(bc, false); ret = 1; } else { /* * Clear the broadcast bit for this cpu if the * device is not power state affected. */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) cpumask_clear_cpu(cpu, tick_broadcast_mask); else tick_device_setup_broadcast_func(dev); /* * Clear the broadcast bit if the CPU is not in * periodic broadcast on state. */ if (!cpumask_test_cpu(cpu, tick_broadcast_on)) cpumask_clear_cpu(cpu, tick_broadcast_mask); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_ONESHOT: /* * If the system is in oneshot mode we can * unconditionally clear the oneshot mask bit, * because the CPU is running and therefore * not in an idle state which causes the power * state affected device to stop. Let the * caller initialize the device. */ tick_broadcast_clear_oneshot(cpu); ret = 0; break; case TICKDEV_MODE_PERIODIC: /* * If the system is in periodic mode, check * whether the broadcast device can be * switched off now. */ if (cpumask_empty(tick_broadcast_mask) && bc) clockevents_shutdown(bc); /* * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt * is delivered by the broadcast device, if * the broadcast device exists and is not * hrtimer based. */ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } int tick_receive_broadcast(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *evt = td->evtdev; if (!evt) return -ENODEV; if (!evt->event_handler) return -EINVAL; evt->event_handler(evt); return 0; } /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ static bool tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; bool local = false; /* * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { struct clock_event_device *bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, mask); /* * We only run the local handler, if the broadcast * device is not hrtimer based. Otherwise we run into * a hrtimer recursion. * * local timer_interrupt() * local_handler() * expire_hrtimers() * bc_handler() * local_handler() * expire_hrtimers() */ local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { /* * It might be necessary to actually check whether the devices * have different broadcast functions. For now, just use the * one of the first device. 
This works as long as we have this * misfeature only on x86 (lapic) */ td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } return local; } /* * Periodic broadcast: * - invoke the broadcast handlers */ static bool tick_do_periodic_broadcast(void) { cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); return tick_do_broadcast(tmpmask); } /* * Event handler for periodic broadcast ticks */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool bc_local; raw_spin_lock(&tick_broadcast_lock); /* Handle spurious interrupts gracefully */ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { raw_spin_unlock(&tick_broadcast_lock); return; } bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC); clockevents_program_event(dev, next, true); } raw_spin_unlock(&tick_broadcast_lock); /* * We run the handler of the local cpu after dropping * tick_broadcast_lock because the handler might deadlock when * trying to switch to oneshot mode. */ if (bc_local) td->evtdev->event_handler(td->evtdev); } /** * tick_broadcast_control - Enable/disable or force broadcast mode * @mode: The selected broadcast mode * * Called when the system enters a state where affected tick devices * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. */ void tick_broadcast_control(enum tick_broadcast_mode mode) { struct clock_event_device *bc, *dev; struct tick_device *td; int cpu, bc_stopped; unsigned long flags; /* Protects also the local clockevent device. */ raw_spin_lock_irqsave(&tick_broadcast_lock, flags); td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; /* * Is the device not affected by the powerstate ? 
*/ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) goto out; if (!tick_device_is_functional(dev)) goto out; cpu = smp_processor_id(); bc = tick_broadcast_device.evtdev; bc_stopped = cpumask_empty(tick_broadcast_mask); switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; fallthrough; case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { /* * Only shutdown the cpu local device, if: * * - the broadcast device exists * - the broadcast device is not a hrtimer based one * - the broadcast device is in periodic mode to * avoid a hiccup during switch to oneshot mode */ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; case TICK_BROADCAST_OFF: if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; } if (bc) { if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else tick_broadcast_setup_oneshot(bc, false); } } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } EXPORT_SYMBOL_GPL(tick_broadcast_control); /* * Set the periodic handler depending on broadcast on/off */ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) { if (!broadcast) dev->event_handler = tick_handle_periodic; else dev->event_handler = tick_handle_periodic_broadcast; } #ifdef CONFIG_HOTPLUG_CPU static void tick_shutdown_broadcast(void) { struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } } /* * Remove a CPU from broadcasting */ void tick_broadcast_offline(unsigned int cpu) { raw_spin_lock(&tick_broadcast_lock); cpumask_clear_cpu(cpu, tick_broadcast_mask); cpumask_clear_cpu(cpu, tick_broadcast_on); tick_broadcast_oneshot_offline(cpu); tick_shutdown_broadcast(); raw_spin_unlock(&tick_broadcast_lock); } #endif void tick_suspend_broadcast(void) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) clockevents_shutdown(bc); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* * This is called from tick_resume_local() on a resuming CPU. That's * called from the core resume function, tick_unfreeze() and the magic XEN * resume hackery. * * In none of these cases the broadcast device mode can change and the * bit of the resuming CPU in the broadcast mask is safe as well. 
*/ bool tick_resume_check_broadcast(void) { if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) return false; else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); } void tick_resume_broadcast(void) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) { clockevents_tick_resume(bc); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); break; case TICKDEV_MODE_ONESHOT: if (!cpumask_empty(tick_broadcast_mask)) tick_resume_broadcast_oneshot(bc); break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly; static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly; static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { return tick_broadcast_oneshot_mask; } /* * Called before going idle with interrupts disabled. Checks whether a * broadcast event from the other core is about to happen. We detected * that in tick_broadcast_oneshot_control(). The callsite can use this * to avoid a deep idle transition as we are about to get the * broadcast IPI right away. */ noinstr int tick_check_broadcast_expired(void) { #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); #else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); #endif } /* * Set broadcast interrupt affinity */ static void tick_broadcast_set_affinity(struct clock_event_device *bc, const struct cpumask *cpumask) { if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) return; if (cpumask_equal(bc->cpumask, cpumask)) return; bc->cpumask = cpumask; irq_set_affinity(bc->irq, bc->cpumask); } static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires) { if (!clockevent_state_oneshot(bc)) clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(bc, expires, 1); tick_broadcast_set_affinity(bc, cpumask_of(cpu)); } static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* * Called from irq_enter() when idle was interrupted to reenable the * per cpu device. */ void tick_check_oneshot_broadcast_this_cpu(void) { if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); /* * We might be in the middle of switching over from * periodic to oneshot. If the CPU has not yet * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { clockevents_switch_state(td->evtdev, CLOCK_EVT_STATE_ONESHOT); } } } /* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; int cpu, next_cpu = 0; bool bc_local; raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; next_event = KTIME_MAX; cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { /* * Required for !SMP because for_each_cpu() reports * unconditionally CPU0 as set on UP kernels. 
*/ if (!IS_ENABLED(CONFIG_SMP) && cpumask_empty(tick_broadcast_oneshot_mask)) break; td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event <= now) { cpumask_set_cpu(cpu, tmpmask); /* * Mark the remote cpu in the pending mask, so * it can avoid reprogramming the cpu local * timer in tick_broadcast_oneshot_control(). */ cpumask_set_cpu(cpu, tick_broadcast_pending_mask); } else if (td->evtdev->next_event < next_event) { next_event = td->evtdev->next_event; next_cpu = cpu; } } /* * Remove the current cpu from the pending mask. The event is * delivered immediately in tick_do_broadcast() ! */ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); /* Take care of enforced broadcast requests */ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); /* * Sanity check. Catch the case where we try to broadcast to * offline cpus. */ if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) cpumask_and(tmpmask, tmpmask, cpu_online_mask); /* * Wakeup the cpus which have an expired event. */ bc_local = tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: * * - The global event did not expire any CPU local * events. This happens in dyntick mode, as the maximum PIT * delta is quite small. * * - There are pending events on sleeping CPUs which were not * in the event mask */ if (next_event != KTIME_MAX) tick_broadcast_set_event(dev, next_cpu, next_event); raw_spin_unlock(&tick_broadcast_lock); if (bc_local) { td = this_cpu_ptr(&tick_cpu_device); td->evtdev->event_handler(td->evtdev); } } static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) { if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) return 0; if (bc->next_event == KTIME_MAX) return 0; return bc->bound_on == cpu ? -EBUSY : 0; } static void broadcast_shutdown_local(struct clock_event_device *bc, struct clock_event_device *dev) { /* * For hrtimer based broadcasting we cannot shutdown the cpu * local device if our own event is the first one to expire or * if we own the broadcast timer. */ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { if (broadcast_needs_cpu(bc, smp_processor_id())) return; if (dev->next_event < bc->next_event) return; } clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state, struct tick_device *td, int cpu) { struct clock_event_device *bc, *dev = td->evtdev; int ret = 0; ktime_t now; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; if (state == TICK_BROADCAST_ENTER) { /* * If the current CPU owns the hrtimer broadcast * mechanism, it cannot go deep idle and we do not add * the CPU to the broadcast mask. We don't have to go * through the EXIT path as the local timer is not * shutdown. */ ret = broadcast_needs_cpu(bc, cpu); if (ret) goto out; /* * If the broadcast device is in periodic mode, we * return. */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { /* If it is a hrtimer based broadcast, return busy */ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) ret = -EBUSY; goto out; } if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and * if the cpu local event is earlier than the * broadcast event. 
If the current CPU is in * the force mask, then we are going to be * woken by the IPI right away; we return * busy, so the CPU does not try to go deep * idle. */ if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { ret = -EBUSY; } else if (dev->next_event < bc->next_event) { tick_broadcast_set_event(bc, cpu, dev->next_event); /* * In case of hrtimer broadcasts the * programming might have moved the * timer to this cpu. If yes, remove * us from the broadcast mask and * return busy. */ ret = broadcast_needs_cpu(bc, cpu); if (ret) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } } } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast * pending mask and fired the broadcast * IPI. So we are going to handle the expired * event anyway via the broadcast IPI * handler. No need to reprogram the timer * with an already expired event. */ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_pending_mask)) goto out; /* * Bail out if there is no next event. */ if (dev->next_event == KTIME_MAX) goto out; /* * If the pending bit is not set, then we are * either the CPU handling the broadcast * interrupt or we got woken by something else. * * We are no longer in the broadcast mask, so * if the cpu local expiry time is already * reached, we would reprogram the cpu local * timer with an already expired event. * * This can lead to a ping-pong when we return * to idle and therefore rearm the broadcast * timer before the cpu local timer was able * to fire. This happens because the forced * reprogramming makes sure that the event * will happen in the future and depending on * the min_delta setting this might be far * enough out that the ping-pong starts. * * If the cpu local next_event has expired * then we know that the broadcast timer * next_event has expired as well and * broadcast is about to be handled. So we * avoid reprogramming and enforce that the * broadcast handler, which did not run yet, * will invoke the cpu local handler. * * We cannot call the handler directly from * here, because we might be in a NOHZ phase * and we did not go through the irq_enter() * nohz fixups. */ now = ktime_get(); if (dev->next_event <= now) { cpumask_set_cpu(cpu, tick_broadcast_force_mask); goto out; } /* * We got woken by something else. Reprogram * the cpu local timer device. 
*/ tick_program_event(dev->next_event, 1); } } out: raw_spin_unlock(&tick_broadcast_lock); return ret; } static int tick_oneshot_wakeup_control(enum tick_broadcast_state state, struct tick_device *td, int cpu) { struct clock_event_device *dev, *wd; dev = td->evtdev; if (td->mode != TICKDEV_MODE_ONESHOT) return -EINVAL; wd = tick_get_oneshot_wakeup_device(cpu); if (!wd) return -ENODEV; switch (state) { case TICK_BROADCAST_ENTER: clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(wd, dev->next_event, 1); break; case TICK_BROADCAST_EXIT: /* We may have transitioned to oneshot mode while idle */ if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT) return -ENODEV; } return 0; } int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); if (!tick_oneshot_wakeup_control(state, td, cpu)) return 0; if (tick_broadcast_device.evtdev) return ___tick_broadcast_oneshot_control(state, td, cpu); /* * If there is no broadcast or wakeup device, tell the caller not * to go into deep idle. */ return -EBUSY; } /* * Reset the one shot broadcast for a cpu * * Called with tick_broadcast_lock held */ static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, ktime_t expires) { struct tick_device *td; int cpu; for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; } } static inline ktime_t tick_get_next_period(void) { ktime_t next; /* * Protect against concurrent updates (store /load tearing on * 32bit). It does not matter if the time is already in the * past. The broadcast device which is about to be programmed will * fire in any case. */ raw_spin_lock(&jiffies_lock); next = tick_next_period; raw_spin_unlock(&jiffies_lock); return next; } /** * tick_broadcast_setup_oneshot - setup the broadcast device * @bc: the broadcast device * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { int cpu = smp_processor_id(); ktime_t nexttick = 0; if (!bc) return; /* * When the broadcast device was switched to oneshot by the first * CPU handling the NOHZ change, the other CPUs will reach this * code via hrtimer_run_queues() -> tick_check_oneshot_change() * too. Set up the broadcast device only once! */ if (bc->event_handler == tick_handle_oneshot_broadcast) { /* * The CPU which switched from periodic to oneshot mode * set the broadcast oneshot bit for all other CPUs which * are in the general (periodic) broadcast mask to ensure * that CPUs which wait for the periodic broadcast are * woken up. * * Clear the bit for the local CPU as the set bit would * prevent the first tick_broadcast_enter() after this CPU * switched to oneshot state to program the broadcast * device. * * This code can also be reached via tick_broadcast_control(), * but this cannot avoid the tick_broadcast_clear_oneshot() * as that would break the periodic to oneshot transition of * secondary CPUs. But that's harmless as the below only * clears already cleared bits. 
*/ tick_broadcast_clear_oneshot(cpu); return; } bc->event_handler = tick_handle_oneshot_broadcast; bc->next_event = KTIME_MAX; /* * When the tick mode is switched from periodic to oneshot it must * be ensured that CPUs which are waiting for periodic broadcast * get their wake-up at the next tick. This is achieved by ORing * tick_broadcast_mask into tick_broadcast_oneshot_mask. * * For other callers, e.g. broadcast device replacement, * tick_broadcast_oneshot_mask must not be touched as this would * set bits for CPUs which are already NOHZ, but not idle. Their * next tick_broadcast_enter() would observe the bit set and fail * to update the expiry time and the broadcast event device. */ if (from_periodic) { cpumask_copy(tmpmask, tick_broadcast_mask); /* Remove the local CPU as it is obviously not idle */ cpumask_clear_cpu(cpu, tmpmask); cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); /* * Ensure that the oneshot broadcast handler will wake the * CPUs which are still waiting for periodic broadcast. */ nexttick = tick_get_next_period(); tick_broadcast_init_next_event(tmpmask, nexttick); /* * If the underlying broadcast clock event device is * already in oneshot state, then there is nothing to do. * The device was already armed for the next tick * in tick_handle_broadcast_periodic() */ if (clockevent_state_oneshot(bc)) return; } /* * When switching from periodic to oneshot mode arm the broadcast * device for the next tick. * * If the broadcast device has been replaced in oneshot mode and * the oneshot broadcast mask is not empty, then arm it to expire * immediately in order to reevaluate the next expiring timer. * @nexttick is 0 and therefore in the past which will cause the * clockevent code to force an event. * * For both cases the programming can be avoided when the oneshot * broadcast mask is empty. * * tick_broadcast_set_event() implicitly switches the broadcast * device to oneshot state. */ if (!cpumask_empty(tick_broadcast_oneshot_mask)) tick_broadcast_set_event(bc, cpu, nexttick); } /* * Select oneshot operating mode for the broadcast device */ void tick_broadcast_switch_to_oneshot(void) { struct clock_event_device *bc; enum tick_device_mode oldmode; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); oldmode = tick_broadcast_device.mode; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { /* * If the broadcast force bit of the current CPU is set, * then the current CPU has not yet reprogrammed the local * timer device to avoid a ping-pong race. See * ___tick_broadcast_oneshot_control(). * * If the broadcast device is hrtimer based then * programming the broadcast event below does not have any * effect because the local clockevent device is not * running and not programmed because the broadcast event * is not earlier than the pending event of the local clock * event device. As a consequence all CPUs waiting for a * broadcast event are stuck forever. * * Detect this condition and reprogram the cpu local timer * device to avoid the starvation. 
*/ if (tick_check_broadcast_expired()) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* * Remove a dying CPU from broadcasting */ static void tick_broadcast_oneshot_offline(unsigned int cpu) { if (tick_get_oneshot_wakeup_device(cpu)) tick_set_oneshot_wakeup_device(NULL, cpu); /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! */ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); } #endif /* * Check, whether the broadcast device is in one shot mode */ int tick_broadcast_oneshot_active(void) { return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } /* * Check whether the broadcast device supports oneshot. */ bool tick_broadcast_oneshot_available(void) { struct clock_event_device *bc = tick_broadcast_device.evtdev; return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } #else int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc = tick_broadcast_device.evtdev; if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) return -EBUSY; return 0; } #endif void __init tick_broadcast_init(void) { zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif }
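/*
 * Example (userspace, not part of the kernel sources above): the broadcast
 * clock event device and the masks managed by this file can be observed from
 * user space. On kernels built with CONFIG_GENERIC_CLOCKEVENTS_BROADCAST and
 * CONFIG_PROC_FS, /proc/timer_list contains a "Broadcast device" section plus
 * the tick_broadcast_mask / tick_broadcast_oneshot_mask bitmaps. The sketch
 * below just filters those lines; the exact output format depends on the
 * kernel version, so treat it as an illustration rather than a stable ABI.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/timer_list", "r");
	char line[512];

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Keep only the tick / broadcast related lines. */
		if (strstr(line, "Broadcast device") ||
		    strstr(line, "tick_broadcast") ||
		    strstr(line, "Tick Device"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}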
47 47 8 8 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/ethtool.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? 
"up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); seq_printf(seq, "Peer Notification Delay (ms): %d\n", bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP Missed Max: %u\n", bond->params.missed_max); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); #if IS_ENABLED(CONFIG_IPV6) printed = 0; seq_printf(seq, "NS IPv6 target/s (xx::xx form):"); for (i = 0; (i < BOND_MAX_NS_TARGETS); i++) { if (ipv6_addr_any(&bond->params.ns_targets[i])) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI6c", &bond->params.ns_targets[i]); printed = 1; } seq_printf(seq, "\n"); #endif } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP active: %s\n", (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? 
"full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", READ_ONCE(slave->queue_id)); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444, bn->proc_dir, &bond_info_seq_ops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } }
14 18 10 12 11 18 74 1522 1515 31 12 12 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> struct macvtap_dev { struct macvlan_dev vlan; struct tap_dev tap; }; /* * Variables for dealing with macvtaps device numbers. */ static dev_t macvtap_major; static const void *macvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class macvtap_class = { .name = "macvtap", .ns_type = &net_ns_type_operations, .namespace = macvtap_net_namespace, }; static struct cdev macvtap_cdev; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static void macvtap_count_tx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_dropped); } static void macvtap_count_rx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; macvlan_count_rx(vlan, 0, 0, 0); } static void macvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; vlan->set_features = features; netdev_update_features(vlan->dev); } static int macvtap_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. 
*/ vlantap->tap.tap_features = TUN_OFFLOADS; /* Register callbacks for rx/tx drops accounting and updating * net_device features */ vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped; vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped; vlantap->tap.update_features = macvtap_update_features; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = macvlan_common_newlink(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return 0; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { struct macvtap_dev *vlantap = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlantap->tap); macvlan_dellink(dev, head); } static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; } static struct net *macvtap_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &macvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. 
*/ err = tap_get_minor(macvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); classdev = device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(macvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); device_destroy(&macvtap_class, devt); tap_free_minor(macvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block macvtap_notifier_block __read_mostly = { .notifier_call = macvtap_device_event, }; static int __init macvtap_init(void) { int err; err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", THIS_MODULE); if (err) goto out1; err = class_register(&macvtap_class); if (err) goto out2; err = register_netdevice_notifier(&macvtap_notifier_block); if (err) goto out3; err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&macvtap_notifier_block); out3: class_unregister(&macvtap_class); out2: tap_destroy_cdev(macvtap_major, &macvtap_cdev); out1: return err; } module_init(macvtap_init); static void __exit macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); unregister_netdevice_notifier(&macvtap_notifier_block); class_unregister(&macvtap_class); tap_destroy_cdev(macvtap_major, &macvtap_cdev); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_DESCRIPTION("MAC-VLAN based tap driver"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_LICENSE("GPL");
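/*
 * Example (userspace, hypothetical device names): macvtap_device_event()
 * above registers a character device named "tap<ifindex>" for each macvtap
 * interface; with the usual udev rules it appears as /dev/tap<ifindex>.
 * The sketch resolves the index of an existing macvtap interface (e.g. one
 * created with "ip link add link eth0 name macvtap0 type macvtap"), opens its
 * tap node and waits briefly for one frame. The frame may carry a leading
 * virtio_net header by default; it is not interpreted here.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>
#include <net/if.h>

int main(int argc, char **argv)
{
	unsigned char frame[2048];
	char node[64];
	unsigned int idx;
	struct pollfd pfd;
	ssize_t n;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <macvtap-interface>\n", argv[0]);
		return 1;
	}
	idx = if_nametoindex(argv[1]);
	if (!idx) {
		perror("if_nametoindex");
		return 1;
	}
	snprintf(node, sizeof(node), "/dev/tap%u", idx);
	fd = open(node, O_RDWR);
	if (fd < 0) {
		perror(node);
		return 1;
	}
	pfd.fd = fd;
	pfd.events = POLLIN;
	/* Wait up to one second for traffic to arrive on the macvtap. */
	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
		n = read(fd, frame, sizeof(frame));
		printf("read %zd bytes from %s\n", n, node);
	} else {
		printf("no frame received on %s within 1s\n", node);
	}
	close(fd);
	return 0;
}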
66 66 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2020 Facebook * Copyright 2020 Google LLC. */ #include <linux/pid.h> #include <linux/sched.h> #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { cant_migrate(); this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { this_cpu_dec(bpf_task_storage_busy); } static bool bpf_task_storage_trylock(void) { cant_migrate(); if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { this_cpu_dec(bpf_task_storage_busy); return false; } return true; } static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; return &task->bpf_storage; } static struct bpf_local_storage_data * task_storage_lookup(struct task_struct *task, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; task_storage = rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held()); if (!task_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); } void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; migrate_disable(); rcu_read_lock(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) goto out; bpf_task_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); out: rcu_read_unlock(); migrate_enable(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct task_struct *task; unsigned int f_flags; struct pid *pid; int fd, err; fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) return ERR_CAST(pid); /* We should be in an RCU read 
side critical section, it should be safe * to call pid_task. */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); if (!task) { err = -ENOENT; goto out; } bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: put_pid(pid); return ERR_PTR(err); } static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct task_struct *task; unsigned int f_flags; struct pid *pid; int fd, err; if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR)) return -EOPNOTSUPP; fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); /* We should be in an RCU read side critical section, it should be safe * to call pid_task. */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); if (!task) { err = -ENOENT; goto out; } bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, true, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: put_pid(pid); return err; } static int task_storage_delete(struct task_struct *task, struct bpf_map *map, bool nobusy) { struct bpf_local_storage_data *sdata; sdata = task_storage_lookup(task, map, false); if (!sdata) return -ENOENT; if (!nobusy) return -EBUSY; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) { struct task_struct *task; unsigned int f_flags; struct pid *pid; int fd, err; fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); /* We should be in an RCU read side critical section, it should be safe * to call pid_task. */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); if (!task) { err = -ENOENT; goto out; } bpf_task_storage_lock(); err = task_storage_delete(task, map, true); bpf_task_storage_unlock(); out: put_pid(pid); return err; } /* Called by bpf_task_storage_get*() helpers */ static void *__bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags, gfp_t gfp_flags, bool nobusy) { struct bpf_local_storage_data *sdata; sdata = task_storage_lookup(task, map, nobusy); if (sdata) return sdata->data; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? 
NULL : sdata->data; } return NULL; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, task, void *, value, u64, flags, gfp_t, gfp_flags) { bool nobusy; void *data; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) return (unsigned long)NULL; nobusy = bpf_task_storage_trylock(); data = __bpf_task_storage_get(map, task, value, flags, gfp_flags, nobusy); if (nobusy) bpf_task_storage_unlock(); return (unsigned long)data; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, task, void *, value, u64, flags, gfp_t, gfp_flags) { void *data; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) return (unsigned long)NULL; bpf_task_storage_lock(); data = __bpf_task_storage_get(map, task, value, flags, gfp_flags, true); bpf_task_storage_unlock(); return (unsigned long)data; } BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, task) { bool nobusy; int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; nobusy = bpf_task_storage_trylock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ ret = task_storage_delete(task, map, nobusy); if (nobusy) bpf_task_storage_unlock(); return ret; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. 
*/ ret = task_storage_delete(task, map, true); bpf_task_storage_unlock(); return ret; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &task_cache, true); } static void task_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = task_storage_map_alloc, .map_free = task_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_pid_task_storage_lookup_elem, .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; const struct bpf_func_proto bpf_task_storage_get_recur_proto = { .func = bpf_task_storage_get_recur, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { .func = bpf_task_storage_delete_recur, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], };
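/*
 * Example (userspace, libbpf; the pin path and value layout are assumptions,
 * not part of the code above): bpf_pid_task_storage_lookup_elem() treats the
 * map key as a pidfd, so a BPF_MAP_TYPE_TASK_STORAGE map that a BPF program
 * has populated can be read from user space by passing a pidfd to
 * bpf_map_lookup_elem(). The sketch assumes a map pinned at
 * /sys/fs/bpf/task_storage_map whose value is a single __u64.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>
#include <bpf/bpf.h>

#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434	/* syscall number on most architectures */
#endif

int main(int argc, char **argv)
{
	int map_fd, pidfd, err;
	__u64 value = 0;
	pid_t pid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atoi(argv[1]);

	map_fd = bpf_obj_get("/sys/fs/bpf/task_storage_map");
	if (map_fd < 0) {
		perror("bpf_obj_get");
		return 1;
	}
	/* The map key is a pidfd referring to the task of interest. */
	pidfd = (int)syscall(SYS_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}
	err = bpf_map_lookup_elem(map_fd, &pidfd, &value);
	if (err)
		fprintf(stderr, "no storage for pid %d (err %d)\n", pid, err);
	else
		printf("task storage value for pid %d: %llu\n", pid,
		       (unsigned long long)value);
	close(pidfd);
	close(map_fd);
	return 0;
}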
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/readdir.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/dirent.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/compat.h>
#include <linux/uaccess.h>

/*
 * Some filesystems were never converted to '->iterate_shared()'
 * and their directory iterators want the inode lock held for
 * writing. This wrapper allows for converting from the shared
 * semantics to the exclusive inode use.
 */
int wrap_directory_iterator(struct file *file, struct dir_context *ctx,
			    int (*iter)(struct file *, struct dir_context *))
{
	struct inode *inode = file_inode(file);
	int ret;

	/*
	 * We'd love to have an 'inode_upgrade_trylock()' operation,
	 * see the comment in mmap_upgrade_trylock() in mm/memory.c.
	 *
	 * But considering this is for "filesystems that never got
	 * converted", it really doesn't matter.
* * Also note that since we have to return with the lock held * for reading, we can't use the "killable()" locking here, * since we do need to get the lock even if we're dying. * * We could do the write part killably and then get the read * lock unconditionally if it mattered, but see above on why * this does the very simplistic conversion. */ up_read(&inode->i_rwsem); down_write(&inode->i_rwsem); /* * Since we dropped the inode lock, we should do the * DEADDIR test again. See 'iterate_dir()' below. * * Note that we don't need to re-do the f_pos games, * since the file must be locked wrt f_pos anyway. */ ret = -ENOENT; if (!IS_DEADDIR(inode)) ret = iter(file, ctx); downgrade_write(&inode->i_rwsem); return ret; } EXPORT_SYMBOL(wrap_directory_iterator); /* * Note the "unsafe_put_user()" semantics: we goto a * label for errors. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ unsafe_put_user(0, dst+len, label); \ unsafe_copy_to_user(dst, src, len, label); \ } while (0) int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); int res = -ENOTDIR; if (!file->f_op->iterate_shared) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; res = fsnotify_file_perm(file, MAY_READ); if (res) goto out; res = down_read_killable(&inode->i_rwsem); if (res) goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; res = file->f_op->iterate_shared(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } inode_unlock_shared(inode); out: return res; } EXPORT_SYMBOL(iterate_dir); /* * POSIX says that a dirent name cannot contain NULL or a '/'. * * It's not 100% clear what we should really do in this case. * The filesystem is clearly corrupted, but returning a hard * error means that you now don't see any of the other names * either, so that isn't a perfect alternative. * * And if you return an error, what error do you use? Several * filesystems seem to have decided on EUCLEAN being the error * code for EFSCORRUPTED, and that may be the error to use. Or * just EIO, which is perhaps more obvious to users. * * In order to see the other file names in the directory, the * caller might want to make this a "soft" error: skip the * entry, and return the error at the end instead. * * Note that this should likely do a "memchr(name, 0, len)" * check too, since that would be filesystem corruption as * well. However, that case can't actually confuse user space, * which has to do a strlen() on the name anyway to find the * filename length, and the above "soft error" worry means * that it's probably better left alone until we have that * issue clarified. * * Note the PATH_MAX check - it's arbitrary but the real * kernel limit on a possible path component, not NAME_MAX, * which is the technical standard limit. */ static int verify_dirent_name(const char *name, int len) { if (len <= 0 || len >= PATH_MAX) return -EIO; if (memchr(name, '/', len)) return -EIO; return 0; } /* * Traditional linux readdir() handling.. * * "count=1" is a special case, meaning that the buffer is one * dirent-structure in size and that the code can't handle more * anyway. Thus the special "fillonedir()" function for that * case (the low-level handlers don't need to care about this). 
*/ #ifdef __ARCH_WANT_OLD_READDIR struct old_linux_dirent { unsigned long d_ino; unsigned long d_offset; unsigned short d_namlen; char d_name[]; }; struct readdir_callback { struct dir_context ctx; struct old_linux_dirent __user * dirent; int result; }; static bool fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_callback *buf = container_of(ctx, struct readdir_callback, ctx); struct old_linux_dirent __user * dirent; unsigned long d_ino; if (buf->result) return false; buf->result = verify_dirent_name(name, namlen); if (buf->result) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; return false; } buf->result++; dirent = buf->dirent; if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(offset, &dirent->d_offset, efault_end); unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; return false; } SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; CLASS(fd_pos, f)(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, .dirent = dirent }; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; return error; } #endif /* __ARCH_WANT_OLD_READDIR */ /* * New, all-improved, singing, dancing, iBCS2-compliant getdents() * interface. */ struct linux_dirent { unsigned long d_ino; unsigned long d_off; unsigned short d_reclen; char d_name[]; }; struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; int prev_reclen; int count; int error; }; static bool filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user *dirent, *prev; struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. 
*/ if (reclen > buf->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; return false; } SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { CLASS(fd_pos, f)(fd); struct getdents_callback buf = { .ctx.actor = filldir, .count = count, .current_dir = dirent }; int error; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct linux_dirent __user * lastdirent; lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } return error; } struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; int prev_reclen; int count; int error; }; static bool filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent, *prev; struct getdents_callback64 *buf = container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. 
*/ if (reclen > buf->count) return false; prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; return false; } SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count) { CLASS(fd_pos, f)(fd); struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, .current_dir = dirent }; int error; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct linux_dirent64 __user * lastdirent; typeof(lastdirent->d_off) d_off = buf.ctx.pos; lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } return error; } #ifdef CONFIG_COMPAT struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; unsigned short d_namlen; char d_name[]; }; struct compat_readdir_callback { struct dir_context ctx; struct compat_old_linux_dirent __user *dirent; int result; }; static bool compat_fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_readdir_callback *buf = container_of(ctx, struct compat_readdir_callback, ctx); struct compat_old_linux_dirent __user *dirent; compat_ulong_t d_ino; if (buf->result) return false; buf->result = verify_dirent_name(name, namlen); if (buf->result) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; return false; } buf->result++; dirent = buf->dirent; if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(offset, &dirent->d_offset, efault_end); unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; return false; } COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; CLASS(fd_pos, f)(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, .dirent = dirent }; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; return error; } struct compat_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_off; unsigned short d_reclen; char d_name[]; }; struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; int prev_reclen; int count; int error; }; static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct 
compat_linux_dirent __user *dirent, *prev; struct compat_getdents_callback *buf = container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; return false; } COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { CLASS(fd_pos, f)(fd); struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, .count = count }; int error; if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { struct compat_linux_dirent __user * lastdirent; lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } return error; } #endif
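/*
 * Example (userspace, raw syscall): filldir64() above writes the records that
 * getdents64(2) hands back to user space. glibc normally hides this behind
 * readdir(3), but the record layout can be walked directly; the struct below
 * mirrors the kernel's linux_dirent64 layout (d_reclen is the offset to the
 * next record, d_type sits immediately before d_name).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

struct linux_dirent64 {
	uint64_t	d_ino;
	int64_t		d_off;
	unsigned short	d_reclen;
	unsigned char	d_type;
	char		d_name[];
};

int main(int argc, char **argv)
{
	char buf[16384];
	const char *dir = argc > 1 ? argv[1] : ".";
	int fd = open(dir, O_RDONLY | O_DIRECTORY);
	long nread;

	if (fd < 0) {
		perror(dir);
		return 1;
	}
	for (;;) {
		nread = syscall(SYS_getdents64, fd, buf, sizeof(buf));
		if (nread <= 0)
			break;	/* 0 = end of directory, <0 = error */
		for (long off = 0; off < nread; ) {
			struct linux_dirent64 *d =
				(struct linux_dirent64 *)(buf + off);
			printf("%8llu  %s\n",
			       (unsigned long long)d->d_ino, d->d_name);
			off += d->d_reclen;
		}
	}
	close(fd);
	return 0;
}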
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Anycast support for IPv6
 *	Linux INET6 implementation
 *
 *	Authors:
 *	David L Stevens (dlstevens@us.ibm.com)
 *
 *	based heavily on net/ipv6/mcast.c
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/route.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/if_inet6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>

#include <net/checksum.h>

#define
IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) /* anycast address hash table */ static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(acaddr_hash_lock); static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); static u32 inet6_acaddr_hash(const struct net *net, const struct in6_addr *addr) { u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } /* * socket join an anycast group */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; struct inet6_dev *idev; struct ipv6_ac_socklist *pac; struct net *net = sock_net(sk); int ishost = !net->ipv6.devconf_all->forwarding; int err = 0; ASSERT_RTNL(); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; if (ifindex) dev = __dev_get_by_index(net, ifindex); if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); if (!pac) return -ENOMEM; pac->acl_next = NULL; pac->acl_addr = *addr; if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } else if (ishost) { err = -EADDRNOTAVAIL; goto error; } else { /* router, no matching interface: just pick one */ dev = __dev_get_by_flags(net, IFF_UP, IFF_UP | IFF_LOOPBACK); } } if (!dev) { err = -ENODEV; goto error; } idev = __in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; else err = -EADDRNOTAVAIL; goto error; } /* reset ishost, now that we have a specific device */ ishost = !idev->cnf.forwarding; pac->acl_ifindex = dev->ifindex; /* XXX * For hosts, allow link-local or matching prefix anycasts. * This obviates the need for propagating anycast routes while * still allowing some non-router anycast participation. 
*/ if (!ipv6_chk_prefix(addr, dev)) { if (ishost) err = -EADDRNOTAVAIL; if (err) goto error; } err = __ipv6_dev_ac_inc(idev, addr); if (!err) { pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; pac = NULL; } error: if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; } /* * socket leave an anycast group */ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev; struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); ASSERT_RTNL(); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && ipv6_addr_equal(&pac->acl_addr, addr)) break; prev_pac = pac; } if (!pac) return -ENOENT; if (prev_pac) prev_pac->acl_next = pac->acl_next; else np->ipv6_ac_list = pac->acl_next; dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); sock_kfree_s(sk, pac, sizeof(*pac)); return 0; } void __ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; struct ipv6_ac_socklist *pac; struct net *net = sock_net(sk); int prev_index; ASSERT_RTNL(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; prev_index = 0; while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } } void ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); if (!np->ipv6_ac_list) return; rtnl_lock(); __ipv6_sock_ac_close(sk); rtnl_unlock(); } static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) { unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); spin_lock(&acaddr_hash_lock); hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); spin_unlock(&acaddr_hash_lock); } static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) { spin_lock(&acaddr_hash_lock); hlist_del_init_rcu(&aca->aca_addr_lst); spin_unlock(&acaddr_hash_lock); } static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } static void aca_free_rcu(struct rcu_head *h) { struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); fib6_info_release(aca->aca_rt); kfree(aca); } static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); if (!aca) return NULL; aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; refcount_set(&aca->aca_refcnt, 1); return aca; } static void inet6_ifacaddr_notify(struct net_device *dev, const struct ifacaddr6 *ifaca, int event) { struct inet6_fill_args fillargs = { .event = event, .netnsid = -1, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(struct in6_addr)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); if (err < 0) { pr_err("Failed to fill in anycast addresses (err %d)\n", err); nlmsg_free(skb); goto error; } rtnl_notify(skb, 
net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err); } /* * device anycast group inc (add if not found) */ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct fib6_info *f6i; struct net *net; int err; ASSERT_RTNL(); write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; goto out; } for (aca = rtnl_dereference(idev->ac_list); aca; aca = rtnl_dereference(aca->aca_next)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) { aca->aca_users++; err = 0; goto out; } } net = dev_net(idev->dev); f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; } aca = aca_alloc(f6i, addr); if (!aca) { fib6_info_release(f6i); err = -ENOMEM; goto out; } /* Hold this for addrconf_join_solict() below before we unlock, * it is already exposed via idev->ac_list. */ aca_get(aca); aca->aca_next = idev->ac_list; rcu_assign_pointer(idev->ac_list, aca); write_unlock_bh(&idev->lock); ipv6_add_acaddr_hash(net, aca); ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST); aca_put(aca); return 0; out: write_unlock_bh(&idev->lock); return err; } /* * device anycast group decrement */ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; ASSERT_RTNL(); write_lock_bh(&idev->lock); prev_aca = NULL; for (aca = rtnl_dereference(idev->ac_list); aca; aca = rtnl_dereference(aca->aca_next)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) break; prev_aca = aca; } if (!aca) { write_unlock_bh(&idev->lock); return -ENOENT; } if (--aca->aca_users > 0) { write_unlock_bh(&idev->lock); return 0; } if (prev_aca) rcu_assign_pointer(prev_aca->aca_next, aca->aca_next); else rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST); aca_put(aca); return 0; } /* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev) return -ENODEV; return __ipv6_dev_ac_dec(idev, addr); } void ipv6_ac_destroy_dev(struct inet6_dev *idev) { struct ifacaddr6 *aca; write_lock_bh(&idev->lock); while ((aca = rtnl_dereference(idev->ac_list)) != NULL) { rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); } /* * check if the interface has this anycast address * called with rcu_read_lock() */ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; struct ifacaddr6 *aca; idev = __in6_dev_get(dev); if (idev) { for (aca = rcu_dereference(idev->ac_list); aca; aca = rcu_dereference(aca->aca_next)) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; return aca != NULL; } return false; } /* * check if given interface (or any, if dev==0) has this anycast address */ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { struct net_device *nh_dev; struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else { unsigned int 
hash = inet6_acaddr_hash(net, addr); hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], aca_addr_lst) { nh_dev = fib6_info_nh_dev(aca->aca_rt); if (!nh_dev || !net_eq(dev_net(nh_dev), net)) continue; if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } } } rcu_read_unlock(); return found; } /* check if this anycast address is link-local on given interface or * is global */ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr) { return ipv6_chk_acast_addr(net, (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? dev : NULL), addr); } #ifdef CONFIG_PROC_FS struct ac6_iter_state { struct seq_net_private p; struct net_device *dev; }; #define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) { struct ac6_iter_state *state = ac6_seq_private(seq); struct net *net = seq_file_net(seq); struct ifacaddr6 *im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->ac_list); if (im) break; } return im; } static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) { struct ac6_iter_state *state = ac6_seq_private(seq); struct inet6_dev *idev; im = rcu_dereference(im->aca_next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) break; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->ac_list); } return im; } static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) { struct ifacaddr6 *im = ac6_get_first(seq); if (im) while (pos && (im = ac6_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ac6_get_idx(seq, *pos); } static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifacaddr6 *im = ac6_get_next(seq, v); ++*pos; return im; } static void ac6_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ac6_seq_show(struct seq_file *seq, void *v) { struct ifacaddr6 *im = (struct ifacaddr6 *)v; struct ac6_iter_state *state = ac6_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d\n", state->dev->ifindex, state->dev->name, &im->aca_addr, im->aca_users); return 0; } static const struct seq_operations ac6_seq_ops = { .start = ac6_seq_start, .next = ac6_seq_next, .stop = ac6_seq_stop, .show = ac6_seq_show, }; int __net_init ac6_proc_init(struct net *net) { if (!proc_create_net("anycast6", 0444, net->proc_net, &ac6_seq_ops, sizeof(struct ac6_iter_state))) return -ENOMEM; return 0; } void ac6_proc_exit(struct net *net) { remove_proc_entry("anycast6", net->proc_net); } #endif /* Init / cleanup code */ int __init ipv6_anycast_init(void) { int i; for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); return 0; } void ipv6_anycast_cleanup(void) { int i; spin_lock(&acaddr_hash_lock); for (i = 0; i < IN6_ADDR_HSIZE; i++) WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); spin_unlock(&acaddr_hash_lock); }
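/*
 * Illustration only, not part of net/ipv6/anycast.c: the socket join/leave
 * paths above (ending in __ipv6_dev_ac_inc()/__ipv6_dev_ac_dec()) are reached
 * from userspace through the IPV6_JOIN_ANYCAST/IPV6_LEAVE_ANYCAST socket
 * options. A minimal sketch of that usage follows; the anycast address and
 * interface name are placeholders.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>

#ifndef IPV6_JOIN_ANYCAST		/* UAPI values from <linux/in6.h> */
#define IPV6_JOIN_ANYCAST  27
#define IPV6_LEAVE_ANYCAST 28
#endif

int main(void)
{
	struct ipv6_mreq mreq;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&mreq, 0, sizeof(mreq));
	inet_pton(AF_INET6, "2001:db8::42", &mreq.ipv6mr_multiaddr);	/* example address */
	mreq.ipv6mr_interface = if_nametoindex("eth0");	/* 0 lets the kernel pick a device by route */

	if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_ANYCAST, &mreq, sizeof(mreq)) < 0)
		perror("IPV6_JOIN_ANYCAST");
	else if (setsockopt(fd, IPPROTO_IPV6, IPV6_LEAVE_ANYCAST, &mreq, sizeof(mreq)) < 0)
		perror("IPV6_LEAVE_ANYCAST");

	close(fd);
	return 0;
}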
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon */ #include <linux/module.h> #include "../../dccp.h" #include "tfrc.h" #define TFRC_CALC_X_ARRSIZE 500 #define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ #define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) /* TFRC TCP Reno Throughput Equation Lookup Table for f(p) The following two-column lookup table implements a part of the TCP throughput equation from [RFC 3448, sec.
3.1]: s X_calc = -------------------------------------------------------------- R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) Where: X is the transmit rate in bytes/second s is the packet size in bytes R is the round trip time in seconds p is the loss event rate, between 0 and 1.0, of the number of loss events as a fraction of the number of packets transmitted t_RTO is the TCP retransmission timeout value in seconds b is the number of packets acknowledged by a single TCP ACK We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: s X_calc = ------------------------------------------------------- R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) which we can break down into: s X_calc = --------- R * f(p) where f(p) is given for 0 < p <= 1 by: f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) Since this is kernel code, floating-point arithmetic is avoided in favour of integer arithmetic. This means that nearly all fractional parameters are scaled by 1000000: * the parameters p and R * the return result f(p) The lookup table therefore actually tabulates the following function g(q): g(q) = 1000000 * f(q/1000000) Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer granularity for the practically more relevant case of small values of p (up to 5%), the second column is used; the first one ranges up to 100%. This split corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also determines the smallest resolution possible with this lookup table: TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE The entire table is generated by: for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); } With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) lookup[M][0] = g(1000000) = 1000000 * f(100%) lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) In summary, the two columns represent f(p) for the following ranges: * The first column is for 0.002 <= p <= 1.0 * The second column is for 0.0001 <= p <= 0.05 Where the columns overlap, the second (finer-grained) is given preference, i.e. the first column is used only for p >= 0.05. 
*/ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { { 37172, 8172 }, { 53499, 11567 }, { 66664, 14180 }, { 78298, 16388 }, { 89021, 18339 }, { 99147, 20108 }, { 108858, 21738 }, { 118273, 23260 }, { 127474, 24693 }, { 136520, 26052 }, { 145456, 27348 }, { 154316, 28589 }, { 163130, 29783 }, { 171919, 30935 }, { 180704, 32049 }, { 189502, 33130 }, { 198328, 34180 }, { 207194, 35202 }, { 216114, 36198 }, { 225097, 37172 }, { 234153, 38123 }, { 243294, 39055 }, { 252527, 39968 }, { 261861, 40864 }, { 271305, 41743 }, { 280866, 42607 }, { 290553, 43457 }, { 300372, 44293 }, { 310333, 45117 }, { 320441, 45929 }, { 330705, 46729 }, { 341131, 47518 }, { 351728, 48297 }, { 362501, 49066 }, { 373460, 49826 }, { 384609, 50577 }, { 395958, 51320 }, { 407513, 52054 }, { 419281, 52780 }, { 431270, 53499 }, { 443487, 54211 }, { 455940, 54916 }, { 468635, 55614 }, { 481581, 56306 }, { 494785, 56991 }, { 508254, 57671 }, { 521996, 58345 }, { 536019, 59014 }, { 550331, 59677 }, { 564939, 60335 }, { 579851, 60988 }, { 595075, 61636 }, { 610619, 62279 }, { 626491, 62918 }, { 642700, 63553 }, { 659253, 64183 }, { 676158, 64809 }, { 693424, 65431 }, { 711060, 66050 }, { 729073, 66664 }, { 747472, 67275 }, { 766266, 67882 }, { 785464, 68486 }, { 805073, 69087 }, { 825103, 69684 }, { 845562, 70278 }, { 866460, 70868 }, { 887805, 71456 }, { 909606, 72041 }, { 931873, 72623 }, { 954614, 73202 }, { 977839, 73778 }, { 1001557, 74352 }, { 1025777, 74923 }, { 1050508, 75492 }, { 1075761, 76058 }, { 1101544, 76621 }, { 1127867, 77183 }, { 1154739, 77741 }, { 1182172, 78298 }, { 1210173, 78852 }, { 1238753, 79405 }, { 1267922, 79955 }, { 1297689, 80503 }, { 1328066, 81049 }, { 1359060, 81593 }, { 1390684, 82135 }, { 1422947, 82675 }, { 1455859, 83213 }, { 1489430, 83750 }, { 1523671, 84284 }, { 1558593, 84817 }, { 1594205, 85348 }, { 1630518, 85878 }, { 1667543, 86406 }, { 1705290, 86932 }, { 1743770, 87457 }, { 1782994, 87980 }, { 1822973, 88501 }, { 1863717, 89021 }, { 1905237, 89540 }, { 1947545, 90057 }, { 1990650, 90573 }, { 2034566, 91087 }, { 2079301, 91600 }, { 2124869, 92111 }, { 2171279, 92622 }, { 2218543, 93131 }, { 2266673, 93639 }, { 2315680, 94145 }, { 2365575, 94650 }, { 2416371, 95154 }, { 2468077, 95657 }, { 2520707, 96159 }, { 2574271, 96660 }, { 2628782, 97159 }, { 2684250, 97658 }, { 2740689, 98155 }, { 2798110, 98651 }, { 2856524, 99147 }, { 2915944, 99641 }, { 2976382, 100134 }, { 3037850, 100626 }, { 3100360, 101117 }, { 3163924, 101608 }, { 3228554, 102097 }, { 3294263, 102586 }, { 3361063, 103073 }, { 3428966, 103560 }, { 3497984, 104045 }, { 3568131, 104530 }, { 3639419, 105014 }, { 3711860, 105498 }, { 3785467, 105980 }, { 3860253, 106462 }, { 3936229, 106942 }, { 4013410, 107422 }, { 4091808, 107902 }, { 4171435, 108380 }, { 4252306, 108858 }, { 4334431, 109335 }, { 4417825, 109811 }, { 4502501, 110287 }, { 4588472, 110762 }, { 4675750, 111236 }, { 4764349, 111709 }, { 4854283, 112182 }, { 4945564, 112654 }, { 5038206, 113126 }, { 5132223, 113597 }, { 5227627, 114067 }, { 5324432, 114537 }, { 5422652, 115006 }, { 5522299, 115474 }, { 5623389, 115942 }, { 5725934, 116409 }, { 5829948, 116876 }, { 5935446, 117342 }, { 6042439, 117808 }, { 6150943, 118273 }, { 6260972, 118738 }, { 6372538, 119202 }, { 6485657, 119665 }, { 6600342, 120128 }, { 6716607, 120591 }, { 6834467, 121053 }, { 6953935, 121514 }, { 7075025, 121976 }, { 7197752, 122436 }, { 7322131, 122896 }, { 7448175, 123356 }, { 7575898, 123815 }, { 7705316, 124274 }, { 7836442, 124733 }, { 7969291, 125191 }, { 8103877, 
125648 }, { 8240216, 126105 }, { 8378321, 126562 }, { 8518208, 127018 }, { 8659890, 127474 }, { 8803384, 127930 }, { 8948702, 128385 }, { 9095861, 128840 }, { 9244875, 129294 }, { 9395760, 129748 }, { 9548529, 130202 }, { 9703198, 130655 }, { 9859782, 131108 }, { 10018296, 131561 }, { 10178755, 132014 }, { 10341174, 132466 }, { 10505569, 132917 }, { 10671954, 133369 }, { 10840345, 133820 }, { 11010757, 134271 }, { 11183206, 134721 }, { 11357706, 135171 }, { 11534274, 135621 }, { 11712924, 136071 }, { 11893673, 136520 }, { 12076536, 136969 }, { 12261527, 137418 }, { 12448664, 137867 }, { 12637961, 138315 }, { 12829435, 138763 }, { 13023101, 139211 }, { 13218974, 139658 }, { 13417071, 140106 }, { 13617407, 140553 }, { 13819999, 140999 }, { 14024862, 141446 }, { 14232012, 141892 }, { 14441465, 142339 }, { 14653238, 142785 }, { 14867346, 143230 }, { 15083805, 143676 }, { 15302632, 144121 }, { 15523842, 144566 }, { 15747453, 145011 }, { 15973479, 145456 }, { 16201939, 145900 }, { 16432847, 146345 }, { 16666221, 146789 }, { 16902076, 147233 }, { 17140429, 147677 }, { 17381297, 148121 }, { 17624696, 148564 }, { 17870643, 149007 }, { 18119154, 149451 }, { 18370247, 149894 }, { 18623936, 150336 }, { 18880241, 150779 }, { 19139176, 151222 }, { 19400759, 151664 }, { 19665007, 152107 }, { 19931936, 152549 }, { 20201564, 152991 }, { 20473907, 153433 }, { 20748982, 153875 }, { 21026807, 154316 }, { 21307399, 154758 }, { 21590773, 155199 }, { 21876949, 155641 }, { 22165941, 156082 }, { 22457769, 156523 }, { 22752449, 156964 }, { 23049999, 157405 }, { 23350435, 157846 }, { 23653774, 158287 }, { 23960036, 158727 }, { 24269236, 159168 }, { 24581392, 159608 }, { 24896521, 160049 }, { 25214642, 160489 }, { 25535772, 160929 }, { 25859927, 161370 }, { 26187127, 161810 }, { 26517388, 162250 }, { 26850728, 162690 }, { 27187165, 163130 }, { 27526716, 163569 }, { 27869400, 164009 }, { 28215234, 164449 }, { 28564236, 164889 }, { 28916423, 165328 }, { 29271815, 165768 }, { 29630428, 166208 }, { 29992281, 166647 }, { 30357392, 167087 }, { 30725779, 167526 }, { 31097459, 167965 }, { 31472452, 168405 }, { 31850774, 168844 }, { 32232445, 169283 }, { 32617482, 169723 }, { 33005904, 170162 }, { 33397730, 170601 }, { 33792976, 171041 }, { 34191663, 171480 }, { 34593807, 171919 }, { 34999428, 172358 }, { 35408544, 172797 }, { 35821174, 173237 }, { 36237335, 173676 }, { 36657047, 174115 }, { 37080329, 174554 }, { 37507197, 174993 }, { 37937673, 175433 }, { 38371773, 175872 }, { 38809517, 176311 }, { 39250924, 176750 }, { 39696012, 177190 }, { 40144800, 177629 }, { 40597308, 178068 }, { 41053553, 178507 }, { 41513554, 178947 }, { 41977332, 179386 }, { 42444904, 179825 }, { 42916290, 180265 }, { 43391509, 180704 }, { 43870579, 181144 }, { 44353520, 181583 }, { 44840352, 182023 }, { 45331092, 182462 }, { 45825761, 182902 }, { 46324378, 183342 }, { 46826961, 183781 }, { 47333531, 184221 }, { 47844106, 184661 }, { 48358706, 185101 }, { 48877350, 185541 }, { 49400058, 185981 }, { 49926849, 186421 }, { 50457743, 186861 }, { 50992759, 187301 }, { 51531916, 187741 }, { 52075235, 188181 }, { 52622735, 188622 }, { 53174435, 189062 }, { 53730355, 189502 }, { 54290515, 189943 }, { 54854935, 190383 }, { 55423634, 190824 }, { 55996633, 191265 }, { 56573950, 191706 }, { 57155606, 192146 }, { 57741621, 192587 }, { 58332014, 193028 }, { 58926806, 193470 }, { 59526017, 193911 }, { 60129666, 194352 }, { 60737774, 194793 }, { 61350361, 195235 }, { 61967446, 195677 }, { 62589050, 196118 }, { 63215194, 196560 }, { 63845897, 197002 }, { 64481179, 
197444 }, { 65121061, 197886 }, { 65765563, 198328 }, { 66414705, 198770 }, { 67068508, 199213 }, { 67726992, 199655 }, { 68390177, 200098 }, { 69058085, 200540 }, { 69730735, 200983 }, { 70408147, 201426 }, { 71090343, 201869 }, { 71777343, 202312 }, { 72469168, 202755 }, { 73165837, 203199 }, { 73867373, 203642 }, { 74573795, 204086 }, { 75285124, 204529 }, { 76001380, 204973 }, { 76722586, 205417 }, { 77448761, 205861 }, { 78179926, 206306 }, { 78916102, 206750 }, { 79657310, 207194 }, { 80403571, 207639 }, { 81154906, 208084 }, { 81911335, 208529 }, { 82672880, 208974 }, { 83439562, 209419 }, { 84211402, 209864 }, { 84988421, 210309 }, { 85770640, 210755 }, { 86558080, 211201 }, { 87350762, 211647 }, { 88148708, 212093 }, { 88951938, 212539 }, { 89760475, 212985 }, { 90574339, 213432 }, { 91393551, 213878 }, { 92218133, 214325 }, { 93048107, 214772 }, { 93883493, 215219 }, { 94724314, 215666 }, { 95570590, 216114 }, { 96422343, 216561 }, { 97279594, 217009 }, { 98142366, 217457 }, { 99010679, 217905 }, { 99884556, 218353 }, { 100764018, 218801 }, { 101649086, 219250 }, { 102539782, 219698 }, { 103436128, 220147 }, { 104338146, 220596 }, { 105245857, 221046 }, { 106159284, 221495 }, { 107078448, 221945 }, { 108003370, 222394 }, { 108934074, 222844 }, { 109870580, 223294 }, { 110812910, 223745 }, { 111761087, 224195 }, { 112715133, 224646 }, { 113675069, 225097 }, { 114640918, 225548 }, { 115612702, 225999 }, { 116590442, 226450 }, { 117574162, 226902 }, { 118563882, 227353 }, { 119559626, 227805 }, { 120561415, 228258 }, { 121569272, 228710 }, { 122583219, 229162 }, { 123603278, 229615 }, { 124629471, 230068 }, { 125661822, 230521 }, { 126700352, 230974 }, { 127745083, 231428 }, { 128796039, 231882 }, { 129853241, 232336 }, { 130916713, 232790 }, { 131986475, 233244 }, { 133062553, 233699 }, { 134144966, 234153 }, { 135233739, 234608 }, { 136328894, 235064 }, { 137430453, 235519 }, { 138538440, 235975 }, { 139652876, 236430 }, { 140773786, 236886 }, { 141901190, 237343 }, { 143035113, 237799 }, { 144175576, 238256 }, { 145322604, 238713 }, { 146476218, 239170 }, { 147636442, 239627 }, { 148803298, 240085 }, { 149976809, 240542 }, { 151156999, 241000 }, { 152343890, 241459 }, { 153537506, 241917 }, { 154737869, 242376 }, { 155945002, 242835 }, { 157158929, 243294 }, { 158379673, 243753 }, { 159607257, 244213 }, { 160841704, 244673 }, { 162083037, 245133 }, { 163331279, 245593 }, { 164586455, 246054 }, { 165848586, 246514 }, { 167117696, 246975 }, { 168393810, 247437 }, { 169676949, 247898 }, { 170967138, 248360 }, { 172264399, 248822 }, { 173568757, 249284 }, { 174880235, 249747 }, { 176198856, 250209 }, { 177524643, 250672 }, { 178857621, 251136 }, { 180197813, 251599 }, { 181545242, 252063 }, { 182899933, 252527 }, { 184261908, 252991 }, { 185631191, 253456 }, { 187007807, 253920 }, { 188391778, 254385 }, { 189783129, 254851 }, { 191181884, 255316 }, { 192588065, 255782 }, { 194001698, 256248 }, { 195422805, 256714 }, { 196851411, 257181 }, { 198287540, 257648 }, { 199731215, 258115 }, { 201182461, 258582 }, { 202641302, 259050 }, { 204107760, 259518 }, { 205581862, 259986 }, { 207063630, 260454 }, { 208553088, 260923 }, { 210050262, 261392 }, { 211555174, 261861 }, { 213067849, 262331 }, { 214588312, 262800 }, { 216116586, 263270 }, { 217652696, 263741 }, { 219196666, 264211 }, { 220748520, 264682 }, { 222308282, 265153 }, { 223875978, 265625 }, { 225451630, 266097 }, { 227035265, 266569 }, { 228626905, 267041 }, { 230226576, 267514 }, { 231834302, 267986 }, { 233450107, 268460 }, { 
235074016, 268933 }, { 236706054, 269407 }, { 238346244, 269881 }, { 239994613, 270355 }, { 241651183, 270830 }, { 243315981, 271305 } }; /* return largest index i such that fval <= lookup[i][small] */ static inline u32 tfrc_binsearch(u32 fval, u8 small) { u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; while (low < high) { try = (low + high) / 2; if (fval <= tfrc_calc_x_lookup[try][small]) high = try; else low = try + 1; } return high; } /** * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 * @s: packet size in bytes * @R: RTT scaled by 1000000 (i.e., microseconds) * @p: loss ratio estimate scaled by 1000000 * * Returns X_calc in bytes per second (not scaled). */ u32 tfrc_calc_x(u16 s, u32 R, u32 p) { u16 index; u32 f; u64 result; /* check against invalid parameters and divide-by-zero */ BUG_ON(p > 1000000); /* p must not exceed 100% */ BUG_ON(p == 0); /* f(0) = 0, divide by zero */ if (R == 0) { /* possible divide by zero */ DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); return ~0U; } if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ DCCP_WARN("Value of p (%d) below resolution. " "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; f = tfrc_calc_x_lookup[index][1]; } else { /* 0.05 < p <= 1.00 */ index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; f = tfrc_calc_x_lookup[index][0]; } /* * Compute X = s/(R*f(p)) in bytes per second. * Since f(p) and R are both scaled by 1000000, we need to multiply by * 1000000^2. To avoid overflow, the result is computed in two stages. * This works under almost all reasonable operational conditions, for a * wide range of parameters. Yet, should some strange combination of * parameters result in overflow, the use of scaled_div32 will catch * this and return UINT_MAX - which is a logically adequate consequence. */ result = scaled_div(s, R); return scaled_div32(result, f); } /** * tfrc_calc_x_reverse_lookup - try to find p given f(p) * @fvalue: function value to match, scaled by 1000000 * * Returns closest match for p, also scaled by 1000000 */ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) { int index; if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ return 0; /* Error cases. */ if (fvalue < tfrc_calc_x_lookup[0][1]) { DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); return TFRC_SMALLEST_P; } if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); return 1000000; } if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { index = tfrc_binsearch(fvalue, 1); return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; } /* else ... it must be in the coarse-grained column */ index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } /** * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% * @loss_event_rate: loss event rate to invert * When @loss_event_rate is large, there is a chance that p is truncated to 0. * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. */ u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) { if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ return 0; if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ return 1000000; return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); }
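/*
 * Standalone cross-check, not part of the DCCP build: recompute the two
 * lookup columns above in floating point, using the same
 * f(p) = sqrt(2*p/3) + 12*sqrt(3*p/8)*(p + 32*p^3) and the generation loop
 * quoted in the comment block, i.e. g(q) = 1000000 * f(q/1000000). Printing
 * the first and last rows should reproduce { 37172, 8172 } and
 * { 243315981, 271305 }. Compile in userspace with -lm.
 */
#include <math.h>
#include <stdio.h>

#define ARRSIZE 500
#define SPLIT   50000

static double f(double p)
{
	return sqrt(2.0 * p / 3.0) +
	       12.0 * sqrt(3.0 * p / 8.0) * (p + 32.0 * p * p * p);
}

static unsigned long g(double q)
{
	return (unsigned long)(1000000.0 * f(q / 1000000.0) + 0.5);
}

int main(void)
{
	int i;

	for (i = 0; i < ARRSIZE; i++) {
		unsigned long coarse = g((i + 1) * 1000000.0 / ARRSIZE);
		unsigned long fine   = g((i + 1) * (double)SPLIT / ARRSIZE);

		if (i == 0 || i == ARRSIZE - 1)
			printf("lookup[%d] = { %lu, %lu }\n", i, coarse, fine);
	}
	return 0;
}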
// SPDX-License-Identifier: GPL-2.0 /* dvb-usb-firmware.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file contains functions for downloading the firmware to Cypress FX 1 and 2 based devices. * * FIXME: This part does actually not belong to dvb-usb, but to the usb-subsystem. */ #include "dvb-usb-common.h" #include <linux/usb.h> struct usb_cypress_controller { int id; const char *name; /* name of the usb controller */ u16 cpu_cs_register; /* needs to be restarted, when the firmware has been downloaded. */ }; static struct usb_cypress_controller cypress[] = { { .id = DEVICE_SPECIFIC, .name = "Device specific", .cpu_cs_register = 0 }, { .id = CYPRESS_AN2135, .name = "Cypress AN2135", .cpu_cs_register = 0x7f92 }, { .id = CYPRESS_AN2235, .name = "Cypress AN2235", .cpu_cs_register = 0x7f92 }, { .id = CYPRESS_FX2, .name = "Cypress FX2", .cpu_cs_register = 0xe600 }, }; /* * load a firmware packet to the device */ static int usb_cypress_writemem(struct usb_device *udev,u16 addr,u8 *data, u8 len) { return usb_control_msg(udev, usb_sndctrlpipe(udev,0), 0xa0, USB_TYPE_VENDOR, addr, 0x00, data, len, 5000); } int usb_cypress_load_firmware(struct usb_device *udev, const struct firmware *fw, int type) { struct hexline *hx; u8 *buf; int ret, pos = 0; u16 cpu_cs_register = cypress[type].cpu_cs_register; buf = kmalloc(sizeof(*hx), GFP_KERNEL); if (!buf) return -ENOMEM; hx = (struct hexline *)buf; /* stop the CPU */ buf[0] = 1; if (usb_cypress_writemem(udev, cpu_cs_register, buf, 1) != 1) err("could not stop the USB controller CPU."); while ((ret = dvb_usb_get_hexline(fw, hx, &pos)) > 0) { deb_fw("writing to address 0x%04x (buffer: 0x%02x %02x)\n", hx->addr, hx->len, hx->chk); ret = usb_cypress_writemem(udev, hx->addr, hx->data, hx->len); if (ret != hx->len) { err("error while transferring firmware (transferred size: %d, block size: %d)", ret, hx->len); ret = -EINVAL; break; } } if (ret < 0) { err("firmware download failed at %d with %d",pos,ret); kfree(buf); return ret; } if (ret == 0) { /* restart the CPU */ buf[0] = 0; if (usb_cypress_writemem(udev, cpu_cs_register, buf, 1) != 1) { err("could not restart the USB controller CPU."); ret = -EINVAL; } } else ret = -EIO; kfree(buf); return ret; } EXPORT_SYMBOL(usb_cypress_load_firmware); int dvb_usb_download_firmware(struct usb_device *udev, const struct dvb_usb_device_properties *props) { int ret; const struct firmware *fw = NULL; if ((ret = request_firmware(&fw, props->firmware, &udev->dev)) != 0) { err("did not find the firmware file '%s' (status %d).
You can use <kernel_dir>/scripts/get_dvb_firmware to get the firmware", props->firmware,ret); return ret; } info("downloading firmware from file '%s'",props->firmware); switch (props->usb_ctrl) { case CYPRESS_AN2135: case CYPRESS_AN2235: case CYPRESS_FX2: ret = usb_cypress_load_firmware(udev, fw, props->usb_ctrl); break; case DEVICE_SPECIFIC: if (props->download_firmware) ret = props->download_firmware(udev,fw); else { err("BUG: driver didn't specify a download_firmware-callback, although it claims to have a DEVICE_SPECIFIC one."); ret = -EINVAL; } break; default: ret = -EINVAL; break; } release_firmware(fw); return ret; } int dvb_usb_get_hexline(const struct firmware *fw, struct hexline *hx, int *pos) { u8 *b = (u8 *) &fw->data[*pos]; int data_offs = 4; if (*pos >= fw->size) return 0; memset(hx,0,sizeof(struct hexline)); hx->len = b[0]; if ((*pos + hx->len + 4) >= fw->size) return -EINVAL; hx->addr = b[1] | (b[2] << 8); hx->type = b[3]; if (hx->type == 0x04) { /* b[4] and b[5] are the Extended linear address record data field */ hx->addr |= (b[4] << 24) | (b[5] << 16); /* hx->len -= 2; data_offs += 2; */ } memcpy(hx->data,&b[data_offs],hx->len); hx->chk = b[hx->len + data_offs]; *pos += hx->len + 5; return *pos; } EXPORT_SYMBOL(dvb_usb_get_hexline);
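/*
 * Illustration only, not kernel code: the firmware image walked by
 * dvb_usb_get_hexline() above is a flat sequence of records laid out as
 * [len][addr_lo][addr_hi][type][len data bytes][chk], so each record takes
 * len + 5 bytes (the type-0x04 extended-address handling is omitted here).
 * The sample buffer below is made up.
 */
#include <stdio.h>
#include <stddef.h>

struct record {
	unsigned char  len;
	unsigned short addr;
	unsigned char  type;
	const unsigned char *data;
	unsigned char  chk;
};

static int get_record(const unsigned char *buf, size_t size, size_t *pos,
		      struct record *r)
{
	const unsigned char *b = buf + *pos;

	if (*pos >= size)
		return 0;			/* end of image */
	r->len = b[0];
	if (*pos + r->len + 4 >= size)
		return -1;			/* truncated record */
	r->addr = b[1] | (b[2] << 8);
	r->type = b[3];
	r->data = &b[4];
	r->chk  = b[r->len + 4];
	*pos += r->len + 5;
	return 1;
}

int main(void)
{
	/* one fake 3-byte record targeting address 0x7f92 */
	static const unsigned char fw[] = { 0x03, 0x92, 0x7f, 0x00,
					    0xaa, 0xbb, 0xcc, 0x5d };
	struct record r;
	size_t pos = 0;

	while (get_record(fw, sizeof(fw), &pos, &r) > 0)
		printf("record: addr 0x%04x len %u type %u\n",
		       r.addr, r.len, r.type);
	return 0;
}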
// SPDX-License-Identifier: GPL-2.0-only /* * Driver for the ov9650 sensor * * Copyright (C) 2008 Erik Andrén * Copyright (C) 2007 Ilyes Gouta. Based on the m5603x Linux Driver Project.
* Copyright (C) 2005 m5603x Linux Driver Project <m5602@x3ng.com.br> * * Portions of code to USB interface and ALi driver software, * Copyright (c) 2006 Willem Duinker * v4l2 interface modeled after the V4L2 driver * for SN9C10x PC Camera Controllers */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "m5602_ov9650.h" static int ov9650_s_ctrl(struct v4l2_ctrl *ctrl); static void ov9650_dump_registers(struct sd *sd); static const unsigned char preinit_ov9650[][3] = { /* [INITCAM] */ {BRIDGE, M5602_XB_MCU_CLK_DIV, 0x02}, {BRIDGE, M5602_XB_MCU_CLK_CTRL, 0xb0}, {BRIDGE, M5602_XB_SEN_CLK_DIV, 0x00}, {BRIDGE, M5602_XB_SEN_CLK_CTRL, 0xb0}, {BRIDGE, M5602_XB_ADC_CTRL, 0xc0}, {BRIDGE, M5602_XB_SENSOR_CTRL, 0x00}, {BRIDGE, M5602_XB_SENSOR_TYPE, 0x08}, {BRIDGE, M5602_XB_GPIO_DIR, 0x05}, {BRIDGE, M5602_XB_GPIO_DAT, 0x04}, {BRIDGE, M5602_XB_GPIO_EN_H, 0x06}, {BRIDGE, M5602_XB_GPIO_DIR_H, 0x06}, {BRIDGE, M5602_XB_GPIO_DAT_H, 0x00}, {BRIDGE, M5602_XB_GPIO_DAT, 0x00}, {BRIDGE, M5602_XB_I2C_CLK_DIV, 0x0a}, /* Reset chip */ {SENSOR, OV9650_COM7, OV9650_REGISTER_RESET}, /* Enable double clock */ {SENSOR, OV9650_CLKRC, 0x80}, /* Do something out of spec with the power */ {SENSOR, OV9650_OFON, 0x40} }; static const unsigned char init_ov9650[][3] = { /* [INITCAM] */ {BRIDGE, M5602_XB_MCU_CLK_DIV, 0x02}, {BRIDGE, M5602_XB_MCU_CLK_CTRL, 0xb0}, {BRIDGE, M5602_XB_SEN_CLK_DIV, 0x00}, {BRIDGE, M5602_XB_SEN_CLK_CTRL, 0xb0}, {BRIDGE, M5602_XB_ADC_CTRL, 0xc0}, {BRIDGE, M5602_XB_SENSOR_CTRL, 0x00}, {BRIDGE, M5602_XB_SENSOR_TYPE, 0x08}, {BRIDGE, M5602_XB_GPIO_DIR, 0x05}, {BRIDGE, M5602_XB_GPIO_DAT, 0x04}, {BRIDGE, M5602_XB_GPIO_EN_H, 0x06}, {BRIDGE, M5602_XB_GPIO_DIR_H, 0x06}, {BRIDGE, M5602_XB_GPIO_DAT_H, 0x00}, {BRIDGE, M5602_XB_GPIO_DAT, 0x00}, {BRIDGE, M5602_XB_I2C_CLK_DIV, 0x0a}, /* Reset chip */ {SENSOR, OV9650_COM7, OV9650_REGISTER_RESET}, /* One extra reset is needed in order to make the sensor behave properly when resuming from ram, could be a timing issue */ {SENSOR, OV9650_COM7, OV9650_REGISTER_RESET}, /* Enable double clock */ {SENSOR, OV9650_CLKRC, 0x80}, /* Do something out of spec with the power */ {SENSOR, OV9650_OFON, 0x40}, /* Set fast AGC/AEC algorithm with unlimited step size */ {SENSOR, OV9650_COM8, OV9650_FAST_AGC_AEC | OV9650_AEC_UNLIM_STEP_SIZE}, {SENSOR, OV9650_CHLF, 0x10}, {SENSOR, OV9650_ARBLM, 0xbf}, {SENSOR, OV9650_ACOM38, 0x81}, /* Turn off color matrix coefficient double option */ {SENSOR, OV9650_COM16, 0x00}, /* Enable color matrix for RGB/YUV, Delay Y channel, set output Y/UV delay to 1 */ {SENSOR, OV9650_COM13, 0x19}, /* Enable digital BLC, Set output mode to U Y V Y */ {SENSOR, OV9650_TSLB, 0x0c}, /* Limit the AGC/AEC stable upper region */ {SENSOR, OV9650_COM24, 0x00}, /* Enable HREF and some out of spec things */ {SENSOR, OV9650_COM12, 0x73}, /* Set all DBLC offset signs to positive and do some out of spec stuff */ {SENSOR, OV9650_DBLC1, 0xdf}, {SENSOR, OV9650_COM21, 0x06}, {SENSOR, OV9650_RSVD35, 0x91}, /* Necessary, no camera stream without it */ {SENSOR, OV9650_RSVD16, 0x06}, {SENSOR, OV9650_RSVD94, 0x99}, {SENSOR, OV9650_RSVD95, 0x99}, {SENSOR, OV9650_RSVD96, 0x04}, /* Enable full range output */ {SENSOR, OV9650_COM15, 0x0}, /* Enable HREF at optical black, enable ADBLC bias, enable ADBLC, reset timings at format change */ {SENSOR, OV9650_COM6, 0x4b}, /* Subtract 32 from the B channel bias */ {SENSOR, OV9650_BBIAS, 0xa0}, /* Subtract 32 from the Gb channel bias */ {SENSOR, OV9650_GbBIAS, 0xa0}, /* Do not bypass the analog BLC and to some out of spec stuff */ {SENSOR, 
OV9650_Gr_COM, 0x00}, /* Subtract 32 from the R channel bias */ {SENSOR, OV9650_RBIAS, 0xa0}, /* Subtract 32 from the R channel bias */ {SENSOR, OV9650_RBIAS, 0x0}, {SENSOR, OV9650_COM26, 0x80}, {SENSOR, OV9650_ACOMA9, 0x98}, /* Set the AGC/AEC stable region upper limit */ {SENSOR, OV9650_AEW, 0x68}, /* Set the AGC/AEC stable region lower limit */ {SENSOR, OV9650_AEB, 0x5c}, /* Set the high and low limit nibbles to 3 */ {SENSOR, OV9650_VPT, 0xc3}, /* Set the Automatic Gain Ceiling (AGC) to 128x, drop VSYNC at frame drop, limit exposure timing, drop frame when the AEC step is larger than the exposure gap */ {SENSOR, OV9650_COM9, 0x6e}, /* Set VSYNC negative, Set RESET to SLHS (slave mode horizontal sync) and set PWDN to SLVS (slave mode vertical sync) */ {SENSOR, OV9650_COM10, 0x42}, /* Set horizontal column start high to default value */ {SENSOR, OV9650_HSTART, 0x1a}, /* 210 */ /* Set horizontal column end */ {SENSOR, OV9650_HSTOP, 0xbf}, /* 1534 */ /* Complementing register to the two writes above */ {SENSOR, OV9650_HREF, 0xb2}, /* Set vertical row start high bits */ {SENSOR, OV9650_VSTRT, 0x02}, /* Set vertical row end low bits */ {SENSOR, OV9650_VSTOP, 0x7e}, /* Set complementing vertical frame control */ {SENSOR, OV9650_VREF, 0x10}, {SENSOR, OV9650_ADC, 0x04}, {SENSOR, OV9650_HV, 0x40}, /* Enable denoise, and white-pixel erase */ {SENSOR, OV9650_COM22, OV9650_DENOISE_ENABLE | OV9650_WHITE_PIXEL_ENABLE | OV9650_WHITE_PIXEL_OPTION}, /* Enable VARIOPIXEL */ {SENSOR, OV9650_COM3, OV9650_VARIOPIXEL}, {SENSOR, OV9650_COM4, OV9650_QVGA_VARIOPIXEL}, /* Put the sensor in soft sleep mode */ {SENSOR, OV9650_COM2, OV9650_SOFT_SLEEP | OV9650_OUTPUT_DRIVE_2X}, }; static const unsigned char res_init_ov9650[][3] = { {SENSOR, OV9650_COM2, OV9650_OUTPUT_DRIVE_2X}, {BRIDGE, M5602_XB_LINE_OF_FRAME_H, 0x82}, {BRIDGE, M5602_XB_LINE_OF_FRAME_L, 0x00}, {BRIDGE, M5602_XB_PIX_OF_LINE_H, 0x82}, {BRIDGE, M5602_XB_PIX_OF_LINE_L, 0x00}, {BRIDGE, M5602_XB_SIG_INI, 0x01} }; /* Vertically and horizontally flips the image if matched, needed for machines where the sensor is mounted upside down */ static const struct dmi_system_id ov9650_flip_dmi_table[] = { { .ident = "ASUS A6Ja", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6J") } }, { .ident = "ASUS A6JC", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6JC") } }, { .ident = "ASUS A6K", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6K") } }, { .ident = "ASUS A6Kt", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6Kt") } }, { .ident = "ASUS A6VA", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6VA") } }, { .ident = "ASUS A6VC", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6VC") } }, { .ident = "ASUS A6VM", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A6VM") } }, { .ident = "ASUS A7V", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "A7V") } }, { .ident = "Alienware Aurora m9700", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "alienware"), DMI_MATCH(DMI_PRODUCT_NAME, "Aurora m9700") } }, {} }; static struct v4l2_pix_format ov9650_modes[] = { { 176, 144, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .sizeimage = 176 * 144, .bytesperline = 176, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 9 }, { 320, 240, 
V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .sizeimage = 320 * 240, .bytesperline = 320, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 8 }, { 352, 288, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .sizeimage = 352 * 288, .bytesperline = 352, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 9 }, { 640, 480, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .sizeimage = 640 * 480, .bytesperline = 640, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 9 } }; static const struct v4l2_ctrl_ops ov9650_ctrl_ops = { .s_ctrl = ov9650_s_ctrl, }; int ov9650_probe(struct sd *sd) { int err = 0; u8 prod_id = 0, ver_id = 0, i; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; if (force_sensor) { if (force_sensor == OV9650_SENSOR) { pr_info("Forcing an %s sensor\n", ov9650.name); goto sensor_found; } /* If we want to force another sensor, don't try to probe this one */ return -ENODEV; } gspca_dbg(gspca_dev, D_PROBE, "Probing for an ov9650 sensor\n"); /* Run the pre-init before probing the sensor */ for (i = 0; i < ARRAY_SIZE(preinit_ov9650) && !err; i++) { u8 data = preinit_ov9650[i][2]; if (preinit_ov9650[i][0] == SENSOR) err = m5602_write_sensor(sd, preinit_ov9650[i][1], &data, 1); else err = m5602_write_bridge(sd, preinit_ov9650[i][1], data); } if (err < 0) return err; if (m5602_read_sensor(sd, OV9650_PID, &prod_id, 1)) return -ENODEV; if (m5602_read_sensor(sd, OV9650_VER, &ver_id, 1)) return -ENODEV; if ((prod_id == 0x96) && (ver_id == 0x52)) { pr_info("Detected an ov9650 sensor\n"); goto sensor_found; } return -ENODEV; sensor_found: sd->gspca_dev.cam.cam_mode = ov9650_modes; sd->gspca_dev.cam.nmodes = ARRAY_SIZE(ov9650_modes); return 0; } int ov9650_init(struct sd *sd) { int i, err = 0; u8 data; if (dump_sensor) ov9650_dump_registers(sd); for (i = 0; i < ARRAY_SIZE(init_ov9650) && !err; i++) { data = init_ov9650[i][2]; if (init_ov9650[i][0] == SENSOR) err = m5602_write_sensor(sd, init_ov9650[i][1], &data, 1); else err = m5602_write_bridge(sd, init_ov9650[i][1], data); } return 0; } int ov9650_init_controls(struct sd *sd) { struct v4l2_ctrl_handler *hdl = &sd->gspca_dev.ctrl_handler; sd->gspca_dev.vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 9); sd->auto_white_bal = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_AUTO_WHITE_BALANCE, 0, 1, 1, 1); sd->red_bal = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_RED_BALANCE, 0, 255, 1, RED_GAIN_DEFAULT); sd->blue_bal = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_BLUE_BALANCE, 0, 255, 1, BLUE_GAIN_DEFAULT); sd->autoexpo = v4l2_ctrl_new_std_menu(hdl, &ov9650_ctrl_ops, V4L2_CID_EXPOSURE_AUTO, 1, 0, V4L2_EXPOSURE_AUTO); sd->expo = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_EXPOSURE, 0, 0x1ff, 4, EXPOSURE_DEFAULT); sd->autogain = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); sd->gain = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_GAIN, 0, 0x3ff, 1, GAIN_DEFAULT); sd->hflip = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); sd->vflip = v4l2_ctrl_new_std(hdl, &ov9650_ctrl_ops, V4L2_CID_VFLIP, 0, 1, 1, 0); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } v4l2_ctrl_auto_cluster(3, &sd->auto_white_bal, 0, false); v4l2_ctrl_auto_cluster(2, &sd->autoexpo, 0, false); v4l2_ctrl_auto_cluster(2, &sd->autogain, 0, false); v4l2_ctrl_cluster(2, &sd->hflip); return 0; } int ov9650_start(struct sd *sd) { u8 data; int i, err = 0; struct cam *cam = &sd->gspca_dev.cam; int width = cam->cam_mode[sd->gspca_dev.curr_mode].width; int height = cam->cam_mode[sd->gspca_dev.curr_mode].height; int ver_offs = 
cam->cam_mode[sd->gspca_dev.curr_mode].priv; int hor_offs = OV9650_LEFT_OFFSET; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; if ((!dmi_check_system(ov9650_flip_dmi_table) && sd->vflip->val) || (dmi_check_system(ov9650_flip_dmi_table) && !sd->vflip->val)) ver_offs--; if (width <= 320) hor_offs /= 2; /* Synthesize the vsync/hsync setup */ for (i = 0; i < ARRAY_SIZE(res_init_ov9650) && !err; i++) { if (res_init_ov9650[i][0] == BRIDGE) err = m5602_write_bridge(sd, res_init_ov9650[i][1], res_init_ov9650[i][2]); else if (res_init_ov9650[i][0] == SENSOR) { data = res_init_ov9650[i][2]; err = m5602_write_sensor(sd, res_init_ov9650[i][1], &data, 1); } } if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_VSYNC_PARA, ((ver_offs >> 8) & 0xff)); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_VSYNC_PARA, (ver_offs & 0xff)); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_VSYNC_PARA, 0); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_VSYNC_PARA, (height >> 8) & 0xff); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_VSYNC_PARA, (height & 0xff)); if (err < 0) return err; for (i = 0; i < 2 && !err; i++) err = m5602_write_bridge(sd, M5602_XB_VSYNC_PARA, 0); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_SIG_INI, 0); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_SIG_INI, 2); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_HSYNC_PARA, (hor_offs >> 8) & 0xff); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_HSYNC_PARA, hor_offs & 0xff); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_HSYNC_PARA, ((width + hor_offs) >> 8) & 0xff); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_HSYNC_PARA, ((width + hor_offs) & 0xff)); if (err < 0) return err; err = m5602_write_bridge(sd, M5602_XB_SIG_INI, 0); if (err < 0) return err; switch (width) { case 640: gspca_dbg(gspca_dev, D_CONF, "Configuring camera for VGA mode\n"); data = OV9650_VGA_SELECT | OV9650_RGB_SELECT | OV9650_RAW_RGB_SELECT; err = m5602_write_sensor(sd, OV9650_COM7, &data, 1); break; case 352: gspca_dbg(gspca_dev, D_CONF, "Configuring camera for CIF mode\n"); data = OV9650_CIF_SELECT | OV9650_RGB_SELECT | OV9650_RAW_RGB_SELECT; err = m5602_write_sensor(sd, OV9650_COM7, &data, 1); break; case 320: gspca_dbg(gspca_dev, D_CONF, "Configuring camera for QVGA mode\n"); data = OV9650_QVGA_SELECT | OV9650_RGB_SELECT | OV9650_RAW_RGB_SELECT; err = m5602_write_sensor(sd, OV9650_COM7, &data, 1); break; case 176: gspca_dbg(gspca_dev, D_CONF, "Configuring camera for QCIF mode\n"); data = OV9650_QCIF_SELECT | OV9650_RGB_SELECT | OV9650_RAW_RGB_SELECT; err = m5602_write_sensor(sd, OV9650_COM7, &data, 1); break; } return err; } int ov9650_stop(struct sd *sd) { u8 data = OV9650_SOFT_SLEEP | OV9650_OUTPUT_DRIVE_2X; return m5602_write_sensor(sd, OV9650_COM2, &data, 1); } void ov9650_disconnect(struct sd *sd) { ov9650_stop(sd); sd->sensor = NULL; } static int ov9650_set_exposure(struct gspca_dev *gspca_dev, __s32 val) { struct sd *sd = (struct sd *) gspca_dev; u8 i2c_data; int err; gspca_dbg(gspca_dev, D_CONF, "Set exposure to %d\n", val); /* The 6 MSBs */ i2c_data = (val >> 10) & 0x3f; err = m5602_write_sensor(sd, OV9650_AECHM, &i2c_data, 1); if (err < 0) return err; /* The 8 middle bits */ i2c_data = (val >> 2) & 0xff; err = m5602_write_sensor(sd, OV9650_AECH, &i2c_data, 1); if (err < 0) return err; /* The 2 LSBs */ i2c_data = val & 0x03; err = m5602_write_sensor(sd, OV9650_COM1, &i2c_data, 1); return 
err; } static int ov9650_set_gain(struct gspca_dev *gspca_dev, __s32 val) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_CONF, "Setting gain to %d\n", val); /* The 2 MSB */ /* Read the OV9650_VREF register first to avoid corrupting the VREF high and low bits */ err = m5602_read_sensor(sd, OV9650_VREF, &i2c_data, 1); if (err < 0) return err; /* Mask away all uninteresting bits */ i2c_data = ((val & 0x0300) >> 2) | (i2c_data & 0x3f); err = m5602_write_sensor(sd, OV9650_VREF, &i2c_data, 1); if (err < 0) return err; /* The 8 LSBs */ i2c_data = val & 0xff; err = m5602_write_sensor(sd, OV9650_GAIN, &i2c_data, 1); return err; } static int ov9650_set_red_balance(struct gspca_dev *gspca_dev, __s32 val) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_CONF, "Set red gain to %d\n", val); i2c_data = val & 0xff; err = m5602_write_sensor(sd, OV9650_RED, &i2c_data, 1); return err; } static int ov9650_set_blue_balance(struct gspca_dev *gspca_dev, __s32 val) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_CONF, "Set blue gain to %d\n", val); i2c_data = val & 0xff; err = m5602_write_sensor(sd, OV9650_BLUE, &i2c_data, 1); return err; } static int ov9650_set_hvflip(struct gspca_dev *gspca_dev) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; int hflip = sd->hflip->val; int vflip = sd->vflip->val; gspca_dbg(gspca_dev, D_CONF, "Set hvflip to %d %d\n", hflip, vflip); if (dmi_check_system(ov9650_flip_dmi_table)) vflip = !vflip; i2c_data = (hflip << 5) | (vflip << 4); err = m5602_write_sensor(sd, OV9650_MVFP, &i2c_data, 1); if (err < 0) return err; /* When vflip is toggled we need to readjust the bridge hsync/vsync */ if (gspca_dev->streaming) err = ov9650_start(sd); return err; } static int ov9650_set_auto_exposure(struct gspca_dev *gspca_dev, __s32 val) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_CONF, "Set auto exposure control to %d\n", val); err = m5602_read_sensor(sd, OV9650_COM8, &i2c_data, 1); if (err < 0) return err; val = (val == V4L2_EXPOSURE_AUTO); i2c_data = ((i2c_data & 0xfe) | ((val & 0x01) << 0)); return m5602_write_sensor(sd, OV9650_COM8, &i2c_data, 1); } static int ov9650_set_auto_white_balance(struct gspca_dev *gspca_dev, __s32 val) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_CONF, "Set auto white balance to %d\n", val); err = m5602_read_sensor(sd, OV9650_COM8, &i2c_data, 1); if (err < 0) return err; i2c_data = ((i2c_data & 0xfd) | ((val & 0x01) << 1)); err = m5602_write_sensor(sd, OV9650_COM8, &i2c_data, 1); return err; } static int ov9650_set_auto_gain(struct gspca_dev *gspca_dev, __s32 val) { int err; u8 i2c_data; struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_CONF, "Set auto gain control to %d\n", val); err = m5602_read_sensor(sd, OV9650_COM8, &i2c_data, 1); if (err < 0) return err; i2c_data = ((i2c_data & 0xfb) | ((val & 0x01) << 2)); return m5602_write_sensor(sd, OV9650_COM8, &i2c_data, 1); } static int ov9650_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *) gspca_dev; int err; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_AUTO_WHITE_BALANCE: err = ov9650_set_auto_white_balance(gspca_dev, ctrl->val); if (err || ctrl->val) return err; err = ov9650_set_red_balance(gspca_dev, sd->red_bal->val); if (err) return err; err = 
ov9650_set_blue_balance(gspca_dev, sd->blue_bal->val); break; case V4L2_CID_EXPOSURE_AUTO: err = ov9650_set_auto_exposure(gspca_dev, ctrl->val); if (err || ctrl->val == V4L2_EXPOSURE_AUTO) return err; err = ov9650_set_exposure(gspca_dev, sd->expo->val); break; case V4L2_CID_AUTOGAIN: err = ov9650_set_auto_gain(gspca_dev, ctrl->val); if (err || ctrl->val) return err; err = ov9650_set_gain(gspca_dev, sd->gain->val); break; case V4L2_CID_HFLIP: err = ov9650_set_hvflip(gspca_dev); break; default: return -EINVAL; } return err; } static void ov9650_dump_registers(struct sd *sd) { int address; pr_info("Dumping the ov9650 register state\n"); for (address = 0; address < 0xa9; address++) { u8 value; m5602_read_sensor(sd, address, &value, 1); pr_info("register 0x%x contains 0x%x\n", address, value); } pr_info("ov9650 register state dump complete\n"); pr_info("Probing for which registers that are read/write\n"); for (address = 0; address < 0xff; address++) { u8 old_value, ctrl_value; u8 test_value[2] = {0xff, 0xff}; m5602_read_sensor(sd, address, &old_value, 1); m5602_write_sensor(sd, address, test_value, 1); m5602_read_sensor(sd, address, &ctrl_value, 1); if (ctrl_value == test_value[0]) pr_info("register 0x%x is writeable\n", address); else pr_info("register 0x%x is read only\n", address); /* Restore original value */ m5602_write_sensor(sd, address, &old_value, 1); } }
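/*
 * Not part of the driver: a standalone sketch of the register splits used by
 * ov9650_set_exposure() and ov9650_set_gain() above -- exposure bits 15:10 go
 * to AECHM, bits 9:2 to AECH, bits 1:0 to COM1; gain bits 9:8 land in
 * VREF[7:6] (preserving the low six VREF bits) and gain bits 7:0 go to GAIN.
 * The sample values are arbitrary.
 */
#include <stdio.h>

int main(void)
{
	unsigned int exposure = 0x1ff;	/* within the 0..0x1ff control range */
	unsigned int gain = 0x2a5;	/* 10-bit gain value */
	unsigned int vref = 0x10;	/* pretend current VREF contents */

	unsigned int aechm = (exposure >> 10) & 0x3f;	/* 6 MSBs */
	unsigned int aech  = (exposure >> 2) & 0xff;	/* 8 middle bits */
	unsigned int com1  = exposure & 0x03;		/* 2 LSBs */

	unsigned int new_vref = ((gain & 0x0300) >> 2) | (vref & 0x3f);
	unsigned int gain_reg = gain & 0xff;

	printf("AECHM=0x%02x AECH=0x%02x COM1=0x%02x\n", aechm, aech, com1);
	printf("VREF=0x%02x GAIN=0x%02x\n", new_vref, gain_reg);
	return 0;
}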
// SPDX-License-Identifier: GPL-2.0-only /* * STP SAP demux * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/llc.h> #include <linux/slab.h> #include <linux/module.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/stp.h> /* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */ #define GARP_ADDR_MIN 0x20 #define GARP_ADDR_MAX 0x2F #define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; static const struct stp_proto __rcu *stp_proto __read_mostly; static struct llc_sap *sap __read_mostly; static unsigned int sap_registered; static DEFINE_MUTEX(stp_proto_mutex); /* Called under rcu_read_lock from LLC */ static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct ethhdr *eh = eth_hdr(skb); const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); const struct stp_proto *proto; if (pdu->ssap != LLC_SAP_BSPAN || pdu->dsap != LLC_SAP_BSPAN || pdu->ctrl_1 != LLC_PDU_TYPE_U) goto err; if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) { proto = rcu_dereference(garp_protos[eh->h_dest[5] - GARP_ADDR_MIN]); if (proto && !ether_addr_equal(eh->h_dest, proto->group_address)) goto err; } else proto = rcu_dereference(stp_proto); if (!proto) goto err; proto->rcv(proto, skb, dev); return 0; err: kfree_skb(skb); return 0; } int stp_proto_register(const struct stp_proto *proto) { int err = 0; mutex_lock(&stp_proto_mutex); if (sap_registered++ == 0) { sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv); if (!sap) { err = -ENOMEM; goto out; } } if (is_zero_ether_addr(proto->group_address)) rcu_assign_pointer(stp_proto, proto); else rcu_assign_pointer(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], proto); out: mutex_unlock(&stp_proto_mutex); return err; } EXPORT_SYMBOL_GPL(stp_proto_register); void stp_proto_unregister(const struct stp_proto *proto) { mutex_lock(&stp_proto_mutex); if (is_zero_ether_addr(proto->group_address)) RCU_INIT_POINTER(stp_proto, NULL); else RCU_INIT_POINTER(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], NULL); synchronize_rcu(); if (--sap_registered == 0) llc_sap_put(sap); mutex_unlock(&stp_proto_mutex); } EXPORT_SYMBOL_GPL(stp_proto_unregister); MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)"); MODULE_LICENSE("GPL");
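/*
 * Illustration only: the dispatch rule used by stp_pdu_rcv() above. Frames to
 * 01:80:c2:00:00:20 .. 01:80:c2:00:00:2f select a per-application GARP handler
 * indexed by the last address octet (and, in the kernel, cross-checked against
 * the registered group address); everything else goes to the single generic
 * STP handler. The handler tables here are stand-ins for garp_protos[] and
 * stp_proto; "gvrp" at 01:80:c2:00:00:21 is just an example registration.
 */
#include <stdio.h>

#define GARP_ADDR_MIN 0x20
#define GARP_ADDR_MAX 0x2F

static const char *garp_handlers[GARP_ADDR_MAX - GARP_ADDR_MIN + 1] = {
	[0x21 - GARP_ADDR_MIN] = "gvrp",	/* 01:80:c2:00:00:21 */
};
static const char *stp_handler = "bridge-stp";

static const char *classify(const unsigned char dest[6])
{
	if (dest[5] >= GARP_ADDR_MIN && dest[5] <= GARP_ADDR_MAX)
		return garp_handlers[dest[5] - GARP_ADDR_MIN];
	return stp_handler;
}

int main(void)
{
	const unsigned char stp_grp[6]  = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
	const unsigned char gvrp_grp[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 };
	const char *h1 = classify(stp_grp);
	const char *h2 = classify(gvrp_grp);

	/* unregistered GARP slots yield NULL, i.e. the frame would be dropped */
	printf("%s\n%s\n", h1 ? h1 : "drop", h2 ? h2 : "drop");
	return 0;
}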
// SPDX-License-Identifier: GPL-2.0-only /* * IPV4 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/udp.h> static struct sk_buff *esp4_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error.
*/ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct iphdr *iph = ip_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); int proto = iph->protocol; skb_push(skb, -skb_network_offset(skb)); esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } if (proto == IPPROTO_IPV6) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm4_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm4_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) && !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = 
features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp_input_done2(skb, 0); } static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int err; int alen; int blksize; struct xfrm_offload *xo; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; int encap_type = 0; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if ((!(features & NETIF_F_HW_ESP) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. */ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; esp.esph = ip_esp_hdr(skb); if (x->encap) encap_type = x->encap->encap_type; if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) { /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header * points to iphdr->protocol (see xfrm4_tunnel_encap_add()). * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol. * Therefore, the protocol field needs to be corrected. 
*/ ip_hdr(skb)->protocol = IPPROTO_UDP; esph->seq_no = htonl(seq); } ip_hdr(skb)->tot_len = htons(skb->len); ip_send_check(ip_hdr(skb)); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, .gso_segment = esp4_gso_segment, }, }; static const struct xfrm_type_offload esp_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, .xmit = esp_xmit, .encap = esp4_gso_encap, }; static int __init esp4_offload_init(void) { if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * configfs_internal.h - Internal stuff for configfs
 *
 * Based on sysfs:
 * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
 *
 * configfs Copyright (C) 2005 Oracle.  All rights reserved.
 */

#ifdef pr_fmt
#undef pr_fmt
#endif

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct configfs_fragment {
	atomic_t frag_count;
	struct rw_semaphore frag_sem;
	bool frag_dead;
};

void put_fragment(struct configfs_fragment *);
struct configfs_fragment *get_fragment(struct configfs_fragment *);

struct configfs_dirent {
	atomic_t		s_count;
	int			s_dependent_count;
	struct list_head	s_sibling;
	struct list_head	s_children;
	int			s_links;
	void			*s_element;
	int			s_type;
	umode_t			s_mode;
	struct dentry		*s_dentry;
	struct iattr		*s_iattr;
#ifdef CONFIG_LOCKDEP
	int			s_depth;
#endif
	struct configfs_fragment *s_frag;
};

#define CONFIGFS_ROOT		0x0001
#define CONFIGFS_DIR		0x0002
#define CONFIGFS_ITEM_ATTR	0x0004
#define CONFIGFS_ITEM_BIN_ATTR	0x0008
#define CONFIGFS_ITEM_LINK	0x0020
#define CONFIGFS_USET_DIR	0x0040
#define CONFIGFS_USET_DEFAULT	0x0080
#define CONFIGFS_USET_DROPPING	0x0100
#define CONFIGFS_USET_IN_MKDIR	0x0200
#define CONFIGFS_USET_CREATING	0x0400
#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)
#define CONFIGFS_PINNED \
	(CONFIGFS_ROOT | CONFIGFS_DIR | CONFIGFS_ITEM_LINK)

extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;

extern struct kmem_cache *configfs_dir_cachep;

extern int configfs_is_root(struct config_item *item);

extern struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *,
					struct super_block *);
extern struct inode *configfs_create(struct dentry *, umode_t mode);

extern int configfs_create_file(struct config_item *,
				const struct configfs_attribute *);
extern int configfs_create_bin_file(struct config_item *,
				    const struct configfs_bin_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *,
				void *, umode_t, int,
				struct configfs_fragment *);
extern int configfs_dirent_is_ready(struct configfs_dirent *);

extern const unsigned char *configfs_get_name(struct configfs_dirent *sd);
extern void configfs_drop_dentry(struct configfs_dirent *sd,
				 struct dentry *parent);
extern int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *iattr);

extern struct dentry *configfs_pin_fs(void);
extern void configfs_release_fs(void);

extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
extern const struct file_operations configfs_bin_file_operations;
extern const struct inode_operations configfs_dir_inode_operations;
extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
extern const struct dentry_operations configfs_dentry_ops;

extern int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
			    struct dentry *dentry, const char *symname);
extern int configfs_unlink(struct inode *dir, struct dentry *dentry);

int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
			 struct dentry *dentry, char *body);

static inline struct config_item *to_item(struct dentry *dentry)
{
	struct configfs_dirent *sd = dentry->d_fsdata;
	return ((struct config_item *)sd->s_element);
}

static inline struct configfs_attribute *to_attr(struct dentry *dentry)
{
	struct configfs_dirent *sd = dentry->d_fsdata;
	return ((struct configfs_attribute *)sd->s_element);
}

static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry)
{
	struct configfs_attribute *attr = to_attr(dentry);

	return container_of(attr, struct configfs_bin_attribute, cb_attr);
}

static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
{
	struct config_item *item = NULL;

	spin_lock(&dentry->d_lock);
	if (!d_unhashed(dentry)) {
		struct configfs_dirent *sd = dentry->d_fsdata;
		item = config_item_get(sd->s_element);
	}
	spin_unlock(&dentry->d_lock);

	return item;
}

static inline void release_configfs_dirent(struct configfs_dirent *sd)
{
	if (!(sd->s_type & CONFIGFS_ROOT)) {
		kfree(sd->s_iattr);
		put_fragment(sd->s_frag);
		kmem_cache_free(configfs_dir_cachep, sd);
	}
}

static inline struct configfs_dirent *configfs_get(struct configfs_dirent *sd)
{
	if (sd) {
		WARN_ON(!atomic_read(&sd->s_count));
		atomic_inc(&sd->s_count);
	}
	return sd;
}

static inline void configfs_put(struct configfs_dirent *sd)
{
	WARN_ON(!atomic_read(&sd->s_count));
	if (atomic_dec_and_test(&sd->s_count))
		release_configfs_dirent(sd);
}
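A short hypothetical sketch (assuming the internal-header context above) of how the reference-counting helpers are meant to be used inside configfs: configfs_get_config_item() pins the config_item behind a dentry under d_lock, and the caller drops it with config_item_put() when done. example_use_item() is an illustrative name, not a function from the listing.

#include <linux/configfs.h>
#include <linux/printk.h>

/* Illustrative only: pin the config_item behind a dentry, use it, drop it. */
static void example_use_item(struct dentry *dentry)
{
	struct config_item *item = configfs_get_config_item(dentry);

	if (!item)
		return;			/* dentry was unhashed; nothing pinned */

	pr_debug("item name: %s\n", config_item_name(item));
	config_item_put(item);		/* drop the reference taken above */
}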
// SPDX-License-Identifier: GPL-2.0-or-later
/* Asymmetric public-key cryptography key type
 *
 * See Documentation/crypto/asymmetric-keys.rst
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#include <keys/asymmetric-subtype.h>
#include <keys/asymmetric-parser.h>
#include <crypto/public_key.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <keys/system_keyring.h>
#include <keys/user-type.h>
#include "asymmetric_keys.h"

static LIST_HEAD(asymmetric_key_parsers);
static DECLARE_RWSEM(asymmetric_key_parsers_sem);

/**
 * find_asymmetric_key - Find a key by ID.
 * @keyring: The keys to search.
 * @id_0: The first ID to look for or NULL.
* @id_1: The second ID to look for or NULL, matched together with @id_0 * against @keyring keys' id[0] and id[1]. * @id_2: The fallback ID to match against @keyring keys' id[2] if both of the * other IDs are NULL. * @partial: Use partial match for @id_0 and @id_1 if true, exact if false. * * Find a key in the given keyring by identifier. The preferred identifier is * the id_0 and the fallback identifier is the id_1. If both are given, the * former is matched (exactly or partially) against either of the sought key's * identifiers and the latter must match the found key's second identifier * exactly. If both are missing, id_2 must match the sought key's third * identifier exactly. */ struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_0, const struct asymmetric_key_id *id_1, const struct asymmetric_key_id *id_2, bool partial) { struct key *key; key_ref_t ref; const char *lookup; char *req, *p; int len; if (id_0) { lookup = id_0->data; len = id_0->len; } else if (id_1) { lookup = id_1->data; len = id_1->len; } else if (id_2) { lookup = id_2->data; len = id_2->len; } else { WARN_ON(1); return ERR_PTR(-EINVAL); } /* Construct an identifier "id:<keyid>". */ p = req = kmalloc(2 + 1 + len * 2 + 1, GFP_KERNEL); if (!req) return ERR_PTR(-ENOMEM); if (!id_0 && !id_1) { *p++ = 'd'; *p++ = 'n'; } else if (partial) { *p++ = 'i'; *p++ = 'd'; } else { *p++ = 'e'; *p++ = 'x'; } *p++ = ':'; p = bin2hex(p, lookup, len); *p = 0; pr_debug("Look up: \"%s\"\n", req); ref = keyring_search(make_key_ref(keyring, 1), &key_type_asymmetric, req, true); if (IS_ERR(ref)) pr_debug("Request for key '%s' err %ld\n", req, PTR_ERR(ref)); kfree(req); if (IS_ERR(ref)) { switch (PTR_ERR(ref)) { /* Hide some search errors */ case -EACCES: case -ENOTDIR: case -EAGAIN: return ERR_PTR(-ENOKEY); default: return ERR_CAST(ref); } } key = key_ref_to_ptr(ref); if (id_0 && id_1) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); if (!kids->id[1]) { pr_debug("First ID matches, but second is missing\n"); goto reject; } if (!asymmetric_key_id_same(id_1, kids->id[1])) { pr_debug("First ID matches, but second does not\n"); goto reject; } } pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key)); return key; reject: key_put(key); return ERR_PTR(-EKEYREJECTED); } EXPORT_SYMBOL_GPL(find_asymmetric_key); /** * asymmetric_key_generate_id: Construct an asymmetric key ID * @val_1: First binary blob * @len_1: Length of first binary blob * @val_2: Second binary blob * @len_2: Length of second binary blob * * Construct an asymmetric key ID from a pair of binary blobs. */ struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1, size_t len_1, const void *val_2, size_t len_2) { struct asymmetric_key_id *kid; kid = kmalloc(sizeof(struct asymmetric_key_id) + len_1 + len_2, GFP_KERNEL); if (!kid) return ERR_PTR(-ENOMEM); kid->len = len_1 + len_2; memcpy(kid->data, val_1, len_1); memcpy(kid->data + len_1, val_2, len_2); return kid; } EXPORT_SYMBOL_GPL(asymmetric_key_generate_id); /** * asymmetric_key_id_same - Return true if two asymmetric keys IDs are the same. 
* @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len != kid2->len) return false; return memcmp(kid1->data, kid2->data, kid1->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_same); /** * asymmetric_key_id_partial - Return true if two asymmetric keys IDs * partially match * @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len < kid2->len) return false; return memcmp(kid1->data + (kid1->len - kid2->len), kid2->data, kid2->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_partial); /** * asymmetric_match_key_ids - Search asymmetric key IDs 1 & 2 * @kids: The pair of key IDs to check * @match_id: The key ID we're looking for * @match: The match function to use */ static bool asymmetric_match_key_ids( const struct asymmetric_key_ids *kids, const struct asymmetric_key_id *match_id, bool (*match)(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2)) { int i; if (!kids || !match_id) return false; for (i = 0; i < 2; i++) if (match(kids->id[i], match_id)) return true; return false; } /* helper function can be called directly with pre-allocated memory */ inline int __asymmetric_key_hex_to_key_id(const char *id, struct asymmetric_key_id *match_id, size_t hexlen) { match_id->len = hexlen; return hex2bin(match_id->data, id, hexlen); } /** * asymmetric_key_hex_to_key_id - Convert a hex string into a key ID. * @id: The ID as a hex string. */ struct asymmetric_key_id *asymmetric_key_hex_to_key_id(const char *id) { struct asymmetric_key_id *match_id; size_t asciihexlen; int ret; if (!*id) return ERR_PTR(-EINVAL); asciihexlen = strlen(id); if (asciihexlen & 1) return ERR_PTR(-EINVAL); match_id = kmalloc(sizeof(struct asymmetric_key_id) + asciihexlen / 2, GFP_KERNEL); if (!match_id) return ERR_PTR(-ENOMEM); ret = __asymmetric_key_hex_to_key_id(id, match_id, asciihexlen / 2); if (ret < 0) { kfree(match_id); return ERR_PTR(-EINVAL); } return match_id; } /* * Match asymmetric keys by an exact match on one of the first two IDs. */ static bool asymmetric_key_cmp(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_same); } /* * Match asymmetric keys by a partial match on one of the first two IDs. */ static bool asymmetric_key_cmp_partial(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_partial); } /* * Match asymmetric keys by an exact match on the third IDs. */ static bool asymmetric_key_cmp_name(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return kids && asymmetric_key_id_same(kids->id[2], match_id); } /* * Preparse the match criterion. If we don't set lookup_type and cmp, * the default will be an exact match on the key description. 
* * There are some specifiers for matching key IDs rather than by the key * description: * * "id:<id>" - find a key by partial match on one of the first two IDs * "ex:<id>" - find a key by exact match on one of the first two IDs * "dn:<id>" - find a key by exact match on the third ID * * These have to be searched by iteration rather than by direct lookup because * the key is hashed according to its description. */ static int asymmetric_key_match_preparse(struct key_match_data *match_data) { struct asymmetric_key_id *match_id; const char *spec = match_data->raw_data; const char *id; bool (*cmp)(const struct key *, const struct key_match_data *) = asymmetric_key_cmp; if (!spec || !*spec) return -EINVAL; if (spec[0] == 'i' && spec[1] == 'd' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_partial; } else if (spec[0] == 'e' && spec[1] == 'x' && spec[2] == ':') { id = spec + 3; } else if (spec[0] == 'd' && spec[1] == 'n' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_name; } else { goto default_match; } match_id = asymmetric_key_hex_to_key_id(id); if (IS_ERR(match_id)) return PTR_ERR(match_id); match_data->preparsed = match_id; match_data->cmp = cmp; match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE; return 0; default_match: return 0; } /* * Free the preparsed the match criterion. */ static void asymmetric_key_match_free(struct key_match_data *match_data) { kfree(match_data->preparsed); } /* * Describe the asymmetric key */ static void asymmetric_key_describe(const struct key *key, struct seq_file *m) { const struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *kid; const unsigned char *p; int n; seq_puts(m, key->description); if (subtype) { seq_puts(m, ": "); subtype->describe(key, m); if (kids && kids->id[1]) { kid = kids->id[1]; seq_putc(m, ' '); n = kid->len; p = kid->data; if (n > 4) { p += n - 4; n = 4; } seq_printf(m, "%*phN", n, p); } seq_puts(m, " ["); /* put something here to indicate the key's capabilities */ seq_putc(m, ']'); } } /* * Preparse a asymmetric payload to get format the contents appropriately for the * internal payload to cut down on the number of scans of the data performed. * * We also generate a proposed description from the contents of the key that * can be used to name the key if the user doesn't want to provide one. 
*/ static int asymmetric_key_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_parser *parser; int ret; pr_devel("==>%s()\n", __func__); if (prep->datalen == 0) return -EINVAL; down_read(&asymmetric_key_parsers_sem); ret = -EBADMSG; list_for_each_entry(parser, &asymmetric_key_parsers, link) { pr_debug("Trying parser '%s'\n", parser->name); ret = parser->parse(prep); if (ret != -EBADMSG) { pr_debug("Parser recognised the format (ret %d)\n", ret); break; } } up_read(&asymmetric_key_parsers_sem); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Clean up the key ID list */ static void asymmetric_key_free_kids(struct asymmetric_key_ids *kids) { int i; if (kids) { for (i = 0; i < ARRAY_SIZE(kids->id); i++) kfree(kids->id[i]); kfree(kids); } } /* * Clean up the preparse data */ static void asymmetric_key_free_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_subtype *subtype = prep->payload.data[asym_subtype]; struct asymmetric_key_ids *kids = prep->payload.data[asym_key_ids]; pr_devel("==>%s()\n", __func__); if (subtype) { subtype->destroy(prep->payload.data[asym_crypto], prep->payload.data[asym_auth]); module_put(subtype->owner); } asymmetric_key_free_kids(kids); kfree(prep->description); } /* * dispose of the data dangling from the corpse of a asymmetric key */ static void asymmetric_key_destroy(struct key *key) { struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); struct asymmetric_key_ids *kids = key->payload.data[asym_key_ids]; void *data = key->payload.data[asym_crypto]; void *auth = key->payload.data[asym_auth]; key->payload.data[asym_crypto] = NULL; key->payload.data[asym_subtype] = NULL; key->payload.data[asym_key_ids] = NULL; key->payload.data[asym_auth] = NULL; if (subtype) { subtype->destroy(data, auth); module_put(subtype->owner); } asymmetric_key_free_kids(kids); } static struct key_restriction *asymmetric_restriction_alloc( key_restrict_link_func_t check, struct key *key) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; keyres->key = key; keyres->keytype = &key_type_asymmetric; return keyres; } /* * look up keyring restrict functions for asymmetric keys */ static struct key_restriction *asymmetric_lookup_restriction( const char *restriction) { char *restrict_method; char *parse_buf; char *next; struct key_restriction *ret = ERR_PTR(-EINVAL); if (strcmp("builtin_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_trusted, NULL); if (strcmp("builtin_and_secondary_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_and_secondary_trusted, NULL); parse_buf = kstrndup(restriction, PAGE_SIZE, GFP_KERNEL); if (!parse_buf) return ERR_PTR(-ENOMEM); next = parse_buf; restrict_method = strsep(&next, ":"); if ((strcmp(restrict_method, "key_or_keyring") == 0) && next) { char *key_text; key_serial_t serial; struct key *key; key_restrict_link_func_t link_fn = restrict_link_by_key_or_keyring; bool allow_null_key = false; key_text = strsep(&next, ":"); if (next) { if (strcmp(next, "chain") != 0) goto out; link_fn = restrict_link_by_key_or_keyring_chain; allow_null_key = true; } if (kstrtos32(key_text, 0, &serial) < 0) goto out; if ((serial == 0) && allow_null_key) { key = NULL; } else { key = key_lookup(serial); if (IS_ERR(key)) { ret = ERR_CAST(key); goto out; } } ret = asymmetric_restriction_alloc(link_fn, key); if (IS_ERR(ret)) key_put(key); } out: 
kfree(parse_buf); return ret; } int asymmetric_key_eds_op(struct kernel_pkey_params *params, const void *in, void *out) { const struct asymmetric_key_subtype *subtype; struct key *key = params->key; int ret; pr_devel("==>%s()\n", __func__); if (key->type != &key_type_asymmetric) return -EINVAL; subtype = asymmetric_key_subtype(key); if (!subtype || !key->payload.data[0]) return -EINVAL; if (!subtype->eds_op) return -ENOTSUPP; ret = subtype->eds_op(params, in, out); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } static int asymmetric_key_verify_signature(struct kernel_pkey_params *params, const void *in, const void *in2) { struct public_key_signature sig = { .s_size = params->in2_len, .digest_size = params->in_len, .encoding = params->encoding, .hash_algo = params->hash_algo, .digest = (void *)in, .s = (void *)in2, }; return verify_signature(params->key, &sig); } struct key_type key_type_asymmetric = { .name = "asymmetric", .preparse = asymmetric_key_preparse, .free_preparse = asymmetric_key_free_preparse, .instantiate = generic_key_instantiate, .match_preparse = asymmetric_key_match_preparse, .match_free = asymmetric_key_match_free, .destroy = asymmetric_key_destroy, .describe = asymmetric_key_describe, .lookup_restriction = asymmetric_lookup_restriction, .asym_query = query_asymmetric_key, .asym_eds_op = asymmetric_key_eds_op, .asym_verify_signature = asymmetric_key_verify_signature, }; EXPORT_SYMBOL_GPL(key_type_asymmetric); /** * register_asymmetric_key_parser - Register a asymmetric key blob parser * @parser: The parser to register */ int register_asymmetric_key_parser(struct asymmetric_key_parser *parser) { struct asymmetric_key_parser *cursor; int ret; down_write(&asymmetric_key_parsers_sem); list_for_each_entry(cursor, &asymmetric_key_parsers, link) { if (strcmp(cursor->name, parser->name) == 0) { pr_err("Asymmetric key parser '%s' already registered\n", parser->name); ret = -EEXIST; goto out; } } list_add_tail(&parser->link, &asymmetric_key_parsers); pr_notice("Asymmetric key parser '%s' registered\n", parser->name); ret = 0; out: up_write(&asymmetric_key_parsers_sem); return ret; } EXPORT_SYMBOL_GPL(register_asymmetric_key_parser); /** * unregister_asymmetric_key_parser - Unregister a asymmetric key blob parser * @parser: The parser to unregister */ void unregister_asymmetric_key_parser(struct asymmetric_key_parser *parser) { down_write(&asymmetric_key_parsers_sem); list_del(&parser->link); up_write(&asymmetric_key_parsers_sem); pr_notice("Asymmetric key parser '%s' unregistered\n", parser->name); } EXPORT_SYMBOL_GPL(unregister_asymmetric_key_parser); /* * Module stuff */ static int __init asymmetric_key_init(void) { return register_key_type(&key_type_asymmetric); } static void __exit asymmetric_key_cleanup(void) { unregister_key_type(&key_type_asymmetric); } module_init(asymmetric_key_init); module_exit(asymmetric_key_cleanup);
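A hedged sketch of the lookup helper exported above: build an asymmetric_key_id from a raw subject key identifier and search a keyring for an exact match against one of the key's first two IDs. The example_* name and the empty second blob are illustrative assumptions, not taken from the listing.

#include <linux/err.h>
#include <linux/slab.h>
#include <keys/asymmetric-type.h>

/* Illustrative: find a certificate key in @keyring by its raw SKID bytes. */
static struct key *example_find_by_skid(struct key *keyring,
					const u8 *skid, size_t skid_len)
{
	struct asymmetric_key_id *kid;
	struct key *key;

	/* second blob left empty; callers often concatenate two blobs here */
	kid = asymmetric_key_generate_id(skid, skid_len, "", 0);
	if (IS_ERR(kid))
		return ERR_CAST(kid);

	/* partial = false: exact match on one of the key's first two IDs */
	key = find_asymmetric_key(keyring, kid, NULL, NULL, false);
	kfree(kid);
	return key;
}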
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	- July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
#include
"blk-throttle.h" #include "blk-ioprio.h" struct dentry *blk_debugfs_root; EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME /** * blk_op_str - Return string XXX in the REQ_OP_XXX. * @op: REQ_OP_XXX. * * Description: Centralize block layer function to convert REQ_OP_XXX into * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". */ inline const char *blk_op_str(enum req_op op) { const char *op_str = "UNKNOWN"; if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) op_str = blk_op_name[op]; return op_str; } EXPORT_SYMBOL_GPL(blk_op_str); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* zone device specific errors */ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, [BLK_STS_INVAL] = { -EINVAL, "invalid" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int 
idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return "<null>"; return blk_errors[idx].name; } EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); percpu_ref_exit(&q->q_usage_counter); kmem_cache_free(blk_requestq_cachep, q); } static void blk_free_queue(struct request_queue *q) { blk_free_queue_stats(q->stats); if (queue_is_mq(q)) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); lockdep_unregister_key(&q->io_lock_cls_key); lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * * Decrements the refcount of the request_queue and free it when the refcount * reaches 0. */ void blk_put_queue(struct request_queue *q) { if (refcount_dec_and_test(&q->refs)) blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ bool freeze = __blk_freeze_queue_start(q, current); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); return freeze; } /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EAGAIN; /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. 
*/ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } int __bio_queue_enter(struct request_queue *q, struct bio *bio) { while (!blk_try_enter_queue(q, false)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if (bio->bi_opf & REQ_NOWAIT) { if (test_bit(GD_DEAD, &disk->state)) goto dead; bio_wouldblock_error(bio); return -EAGAIN; } /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(false, q)) || test_bit(GD_DEAD, &disk->state)); if (test_bit(GD_DEAD, &disk->state)) goto dead; } rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); return -ENODEV; } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = from_timer(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work(struct work_struct *work) { } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) { error = q->id; goto fail_q; } q->stats = blk_alloc_queue_stats(); if (!q->stats) { error = -ENOMEM; goto fail_id; } error = blk_set_default_limits(lim); if (error) goto fail_stats; q->limits = *lim; q->node = node_id; atomic_set(&q->nr_active_requests_shared_tags, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); blkg_init_queue(q); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; lockdep_register_key(&q->io_lock_cls_key); lockdep_register_key(&q->q_lock_cls_key); lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", &q->io_lock_cls_key, 0); lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return ERR_PTR(error); } /** * blk_get_queue - increment the request_queue refcount * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. * * Context: Any context. 
*/ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); bool should_fail_request(struct block_device *part, unsigned int bytes) { return bdev_test_flag(part, BD_MAKE_IT_FAIL) && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return; if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) return; bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); /* * Use ioctl to set underlying disk of raid/dm to read-only * will trigger this. */ pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); } } static noinline int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static int blk_partition_remap(struct bio *bio) { struct block_device *p = bio->bi_bdev; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) return -EIO; if (bio_sectors(bio)) { bio->bi_iter.bi_sector += p->bd_start_sect; trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } bio_set_flag(bio, BIO_REMAPPED); return 0; } /* * Check write append to a zoned block device. */ static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ if (!bdev_is_zoned(bio->bi_bdev)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be * split and could result in non-contiguous sectors being written in * different zones. */ if (nr_sectors > q->limits.chunk_sectors) return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; return BLK_STS_OK; } static void __submit_bio(struct bio *bio) { /* If plug is not used, add new plug here to cache nsecs time. 
*/ struct blk_plug plug; if (unlikely(!blk_crypto_bio_prep(&bio))) return; blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if ((bio->bi_opf & REQ_POLLED) && !(disk->queue->limits.features & BLK_FEAT_POLL)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); } else { disk->fops->submit_bio(bio); } blk_queue_exit(disk->queue); } blk_finish_plug(&plug); } /* * The loop in this function may be a bit non-obvious, and so deserves some * explanation: * * - Before entering the loop, bio->bi_next is NULL (as all callers ensure * that), so we have a list with a single bio. * - We pretend that we have just taken it off a longer list, so we assign * bio_list to a pointer to the bio_list_on_stack, thus initialising the * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. * - In this case we really did just take the bio of the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio, but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_list lower, same; /* * Create a fresh bio_list for all subordinate requests. */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the * same level. */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* * Now assemble so we handle the lowest level first. */ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; current->bio_list = bio_list; do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; } void submit_bio_noacct_nocheck(struct bio *bio) { blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); /* * Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. 
*/ if (current->bio_list) bio_list_add(&current->bio_list[0], bio); else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) __submit_bio_noacct_mq(bio); else __submit_bio_noacct(bio); } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, struct bio *bio) { if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) return BLK_STS_INVAL; if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) return BLK_STS_INVAL; return BLK_STS_OK; } /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is * resubmitted to lower level drivers by stacking block drivers. All file * systems and other upper level users of the block layer should use * submit_bio() instead. */ void submit_bio_noacct(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; might_sleep(); /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; if (bdev_is_partition(bdev) && unlikely(blk_partition_remap(bio))) goto end_io; } /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf)) { if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } } } switch (bio_op(bio)) { case REQ_OP_READ: break; case REQ_OP_WRITE: if (bio->bi_opf & REQ_ATOMIC) { status = blk_validate_atomic_write_op_size(q, bio); if (status != BLK_STS_OK) goto end_io; } break; case REQ_OP_FLUSH: /* * REQ_OP_FLUSH can't be submitted through bios, it is only * synthetized in struct request by the flush state machine. */ goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) goto end_io; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: case REQ_OP_ZONE_RESET_ALL: if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: /* * Driver private operations are only used with passthrough * requests. */ fallthrough; default: goto not_supported; } if (blk_throtl_bio(bio)) return; submit_bio_noacct_nocheck(bio); return; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); } EXPORT_SYMBOL(submit_bio_noacct); static void bio_set_ioprio(struct bio *bio) { /* Nobody set ioprio so far? Initialize it based on task's nice value */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. 
It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The * bio will be sent to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) { if (bio_op(bio) == REQ_OP_READ) { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, bio_sectors(bio)); } else if (bio_op(bio) == REQ_OP_WRITE) { count_vm_events(PGPGOUT, bio_sectors(bio)); } bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** * bio_poll - poll for BIO completions * @bio: bio to poll for * @iob: batches of IO * @flags: BLK_POLL_* flags that control the behavior * * Poll for completions on queue associated with the bio. Returns number of * completed entries found. * * Note: the caller must either be the context that submitted @bio, or * be in an RCU critical section to prevent freeing of @bio. */ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { blk_qc_t cookie = READ_ONCE(bio->bi_cookie); struct block_device *bdev; struct request_queue *q; int ret = 0; bdev = READ_ONCE(bio->bi_bdev); if (!bdev) return 0; q = bdev_get_queue(bdev); if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); /* * We need to be able to enter a frozen queue, similar to how * timeouts also need to do that. If that is blocked, then we can * have pending IO when a queue freeze is started, and then the * wait for the freeze to finish will wait for polled requests to * timeout as the poller is prevented from entering the queue and * completing them. As long as we prevent new IO from being queued, * that should be all that matters. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; if ((q->limits.features & BLK_FEAT_POLL) && disk && disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(bio_poll); /* * Helper to implement file_operations.iopoll. Requires the bio to be stored * in iocb->private, and cleared before freeing the bio. */ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags) { struct bio *bio; int ret = 0; /* * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * * 1) the bio is being initialized and bi_bdev is NULL. We can just * simply do nothing in this case * 2) the bio points to a device that is not poll enabled. bio_poll will * catch this and return 0 * 3) the bio points to a poll capable device, including but not * limited to the one that the original bio pointed to. In this * case we will call into the actual poll method and poll for I/O, * even if we don't need to, but it won't cause harm either. * * For cases 2) and 3) above the RCU grace period ensures that bi_bdev * is still allocated. Because partitions hold a reference to the whole * device bdev and thus disk, the disk is also still valid. Grabbing * a reference to the queue in bio_poll() ensures the hctxs and requests * are still valid as well.
*/ rcu_read_lock(); bio = READ_ONCE(kiocb->private); if (bio) ret = bio_poll(bio, iob, flags); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && (end || part_in_flight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { part = bdev_whole(part); goto again; } } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { part_stat_lock(); update_io_ticks(bdev, start_time, false); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for * * Returns the start time that should be passed back to bio_end_io_acct(). */ unsigned long bio_start_io_acct(struct bio *bio) { return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); update_io_ticks(bdev, now, true); part_stat_inc(bdev, ios[sgrp]); part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. 
*/ if (tsk->plug) return; plug->cur_ktime = 0; rq_list_init(&plug->mq_list); rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; INIT_LIST_HEAD(&plug->cb_list); /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * blk_start_plug() indicates to the block layer an intent by the caller * to submit multiple I/O requests in a batch. The block layer may use * this hint to defer submitting I/Os from the caller until blk_finish_plug() * is called. However, the block layer may choose to submit requests * before a call to blk_finish_plug() if the number of queued I/Os * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if * the task schedules (see below). * * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } /** * blk_finish_plug - mark the end of a batch of submitted I/O * @plug: The &struct blk_plug passed to blk_start_plug() * * Description: * Indicate that a batch of I/O submissions is complete. This function * must be paired with an initial call to blk_start_plug(). The intent * is to allow the block layer to optimize I/O submission. 
See the * documentation for blk_start_plug() for more information. */ void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { __blk_flush_plug(plug, false); current->plug = NULL; } } EXPORT_SYMBOL(blk_finish_plug); void blk_io_schedule(void) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) io_schedule_timeout(timeout); else io_schedule(); } EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; }
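/*
 * Illustrative sketch (not part of blk-core.c): how a caller batches bio
 * submissions under a plug, as described in the blk_start_plug() and
 * blk_finish_plug() documentation above. submit_demo_bios() and its bio
 * array are hypothetical; blk_start_plug(), submit_bio() and
 * blk_finish_plug() are the real block-layer entry points shown in this
 * file.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>

static void submit_demo_bios(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	/* Hint to the block layer that a batch of I/O follows. */
	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	/* Flush whatever is still held in the per-task plug list. */
	blk_finish_plug(&plug);
}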
// SPDX-License-Identifier: GPL-2.0-only /*************************************************************************** * Copyright (C) 2010-2012 by Bruno Prémont <bonbons@linux-vserver.org> * * * * Based on Logitech G13 driver (v0.4) * * Copyright (C) 2009 by Rick L. Vinyard, Jr. <rvinyard@cs.nmsu.edu> * * * ***************************************************************************/ #include <linux/hid.h> #include <linux/hid-debug.h> #include <linux/input.h> #include "hid-ids.h" #include <linux/fb.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/string.h> #include "hid-picolcd.h" /* Input device * * The PicoLCD has an IR receiver header, a built-in keypad with 5 keys * and header for 4x4 key matrix. The built-in keys are part of the matrix.
*/ static const unsigned short def_keymap[PICOLCD_KEYS] = { KEY_RESERVED, /* none */ KEY_BACK, /* col 4 + row 1 */ KEY_HOMEPAGE, /* col 3 + row 1 */ KEY_RESERVED, /* col 2 + row 1 */ KEY_RESERVED, /* col 1 + row 1 */ KEY_SCROLLUP, /* col 4 + row 2 */ KEY_OK, /* col 3 + row 2 */ KEY_SCROLLDOWN, /* col 2 + row 2 */ KEY_RESERVED, /* col 1 + row 2 */ KEY_RESERVED, /* col 4 + row 3 */ KEY_RESERVED, /* col 3 + row 3 */ KEY_RESERVED, /* col 2 + row 3 */ KEY_RESERVED, /* col 1 + row 3 */ KEY_RESERVED, /* col 4 + row 4 */ KEY_RESERVED, /* col 3 + row 4 */ KEY_RESERVED, /* col 2 + row 4 */ KEY_RESERVED, /* col 1 + row 4 */ }; /* Find a given report */ struct hid_report *picolcd_report(int id, struct hid_device *hdev, int dir) { struct list_head *feature_report_list = &hdev->report_enum[dir].report_list; struct hid_report *report = NULL; list_for_each_entry(report, feature_report_list, list) { if (report->id == id) return report; } hid_warn(hdev, "No report with id 0x%x found\n", id); return NULL; } /* Submit a report and wait for a reply from device - if device fades away * or does not respond in time, return NULL */ struct picolcd_pending *picolcd_send_and_wait(struct hid_device *hdev, int report_id, const u8 *raw_data, int size) { struct picolcd_data *data = hid_get_drvdata(hdev); struct picolcd_pending *work; struct hid_report *report = picolcd_out_report(report_id, hdev); unsigned long flags; int i, j, k; if (!report || !data) return NULL; if (data->status & PICOLCD_FAILED) return NULL; work = kzalloc(sizeof(*work), GFP_KERNEL); if (!work) return NULL; init_completion(&work->ready); work->out_report = report; work->in_report = NULL; work->raw_size = 0; mutex_lock(&data->mutex); spin_lock_irqsave(&data->lock, flags); for (i = k = 0; i < report->maxfield; i++) for (j = 0; j < report->field[i]->report_count; j++) { hid_set_field(report->field[i], j, k < size ? 
raw_data[k] : 0); k++; } if (data->status & PICOLCD_FAILED) { kfree(work); work = NULL; } else { data->pending = work; hid_hw_request(data->hdev, report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&data->lock, flags); wait_for_completion_interruptible_timeout(&work->ready, HZ*2); spin_lock_irqsave(&data->lock, flags); data->pending = NULL; } spin_unlock_irqrestore(&data->lock, flags); mutex_unlock(&data->mutex); return work; } /* * input class device */ static int picolcd_raw_keypad(struct picolcd_data *data, struct hid_report *report, u8 *raw_data, int size) { /* * Keypad event * First and second data bytes list currently pressed keys, * 0x00 means no key and at most 2 keys may be pressed at same time */ int i, j; /* determine newly pressed keys */ for (i = 0; i < size; i++) { unsigned int key_code; if (raw_data[i] == 0) continue; for (j = 0; j < sizeof(data->pressed_keys); j++) if (data->pressed_keys[j] == raw_data[i]) goto key_already_down; for (j = 0; j < sizeof(data->pressed_keys); j++) if (data->pressed_keys[j] == 0) { data->pressed_keys[j] = raw_data[i]; break; } input_event(data->input_keys, EV_MSC, MSC_SCAN, raw_data[i]); if (raw_data[i] < PICOLCD_KEYS) key_code = data->keycode[raw_data[i]]; else key_code = KEY_UNKNOWN; if (key_code != KEY_UNKNOWN) { dbg_hid(PICOLCD_NAME " got key press for %u:%d", raw_data[i], key_code); input_report_key(data->input_keys, key_code, 1); } input_sync(data->input_keys); key_already_down: continue; } /* determine newly released keys */ for (j = 0; j < sizeof(data->pressed_keys); j++) { unsigned int key_code; if (data->pressed_keys[j] == 0) continue; for (i = 0; i < size; i++) if (data->pressed_keys[j] == raw_data[i]) goto key_still_down; input_event(data->input_keys, EV_MSC, MSC_SCAN, data->pressed_keys[j]); if (data->pressed_keys[j] < PICOLCD_KEYS) key_code = data->keycode[data->pressed_keys[j]]; else key_code = KEY_UNKNOWN; if (key_code != KEY_UNKNOWN) { dbg_hid(PICOLCD_NAME " got key release for %u:%d", data->pressed_keys[j], key_code); input_report_key(data->input_keys, key_code, 0); } input_sync(data->input_keys); data->pressed_keys[j] = 0; key_still_down: continue; } return 1; } static int picolcd_check_version(struct hid_device *hdev) { struct picolcd_data *data = hid_get_drvdata(hdev); struct picolcd_pending *verinfo; int ret = 0; if (!data) return -ENODEV; verinfo = picolcd_send_and_wait(hdev, REPORT_VERSION, NULL, 0); if (!verinfo) { hid_err(hdev, "no version response from PicoLCD\n"); return -ENODEV; } if (verinfo->raw_size == 2) { data->version[0] = verinfo->raw_data[1]; data->version[1] = verinfo->raw_data[0]; if (data->status & PICOLCD_BOOTLOADER) { hid_info(hdev, "PicoLCD, bootloader version %d.%d\n", verinfo->raw_data[1], verinfo->raw_data[0]); } else { hid_info(hdev, "PicoLCD, firmware version %d.%d\n", verinfo->raw_data[1], verinfo->raw_data[0]); } } else { hid_err(hdev, "confused, got unexpected version response from PicoLCD\n"); ret = -EINVAL; } kfree(verinfo); return ret; } /* * Reset our device and wait for answer to VERSION request */ int picolcd_reset(struct hid_device *hdev) { struct picolcd_data *data = hid_get_drvdata(hdev); struct hid_report *report = picolcd_out_report(REPORT_RESET, hdev); unsigned long flags; int error; if (!data || !report || report->maxfield != 1) return -ENODEV; spin_lock_irqsave(&data->lock, flags); if (hdev->product == USB_DEVICE_ID_PICOLCD_BOOTLOADER) data->status |= PICOLCD_BOOTLOADER; /* perform the reset */ hid_set_field(report->field[0], 0, 1); if (data->status & PICOLCD_FAILED) { 
spin_unlock_irqrestore(&data->lock, flags); return -ENODEV; } hid_hw_request(hdev, report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&data->lock, flags); error = picolcd_check_version(hdev); if (error) return error; picolcd_resume_lcd(data); picolcd_resume_backlight(data); picolcd_fb_refresh(data); picolcd_leds_set(data); return 0; } /* * The "operation_mode" sysfs attribute */ static ssize_t picolcd_operation_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct picolcd_data *data = dev_get_drvdata(dev); if (data->status & PICOLCD_BOOTLOADER) return sysfs_emit(buf, "[bootloader] lcd\n"); else return sysfs_emit(buf, "bootloader [lcd]\n"); } static ssize_t picolcd_operation_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct picolcd_data *data = dev_get_drvdata(dev); struct hid_report *report = NULL; int timeout = data->opmode_delay; unsigned long flags; if (sysfs_streq(buf, "lcd")) { if (data->status & PICOLCD_BOOTLOADER) report = picolcd_out_report(REPORT_EXIT_FLASHER, data->hdev); } else if (sysfs_streq(buf, "bootloader")) { if (!(data->status & PICOLCD_BOOTLOADER)) report = picolcd_out_report(REPORT_EXIT_KEYBOARD, data->hdev); } else { return -EINVAL; } if (!report || report->maxfield != 1) return -EINVAL; spin_lock_irqsave(&data->lock, flags); hid_set_field(report->field[0], 0, timeout & 0xff); hid_set_field(report->field[0], 1, (timeout >> 8) & 0xff); hid_hw_request(data->hdev, report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&data->lock, flags); return count; } static DEVICE_ATTR(operation_mode, 0644, picolcd_operation_mode_show, picolcd_operation_mode_store); /* * The "operation_mode_delay" sysfs attribute */ static ssize_t picolcd_operation_mode_delay_show(struct device *dev, struct device_attribute *attr, char *buf) { struct picolcd_data *data = dev_get_drvdata(dev); return sysfs_emit(buf, "%hu\n", data->opmode_delay); } static ssize_t picolcd_operation_mode_delay_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct picolcd_data *data = dev_get_drvdata(dev); unsigned u; if (sscanf(buf, "%u", &u) != 1) return -EINVAL; if (u > 30000) return -EINVAL; else data->opmode_delay = u; return count; } static DEVICE_ATTR(operation_mode_delay, 0644, picolcd_operation_mode_delay_show, picolcd_operation_mode_delay_store); /* * Handle raw report as sent by device */ static int picolcd_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *raw_data, int size) { struct picolcd_data *data = hid_get_drvdata(hdev); unsigned long flags; if (!data) return 1; if (size > 64) { hid_warn(hdev, "invalid size value (%d) for picolcd raw event (%d)\n", size, report->id); return 0; } if (report->id == REPORT_KEY_STATE) { if (data->input_keys) picolcd_raw_keypad(data, report, raw_data+1, size-1); } else if (report->id == REPORT_IR_DATA) { picolcd_raw_cir(data, report, raw_data+1, size-1); } else { spin_lock_irqsave(&data->lock, flags); /* * We let the caller of picolcd_send_and_wait() check if the * report we got is one of the expected ones or not. 
*/ if (data->pending) { memcpy(data->pending->raw_data, raw_data+1, size-1); data->pending->raw_size = size-1; data->pending->in_report = report; complete(&data->pending->ready); } spin_unlock_irqrestore(&data->lock, flags); } picolcd_debug_raw_event(data, hdev, report, raw_data, size); return 1; } #ifdef CONFIG_PM static int picolcd_suspend(struct hid_device *hdev, pm_message_t message) { if (PMSG_IS_AUTO(message)) return 0; picolcd_suspend_backlight(hid_get_drvdata(hdev)); dbg_hid(PICOLCD_NAME " device ready for suspend\n"); return 0; } static int picolcd_resume(struct hid_device *hdev) { int ret; ret = picolcd_resume_backlight(hid_get_drvdata(hdev)); if (ret) dbg_hid(PICOLCD_NAME " restoring backlight failed: %d\n", ret); return 0; } static int picolcd_reset_resume(struct hid_device *hdev) { int ret; ret = picolcd_reset(hdev); if (ret) dbg_hid(PICOLCD_NAME " resetting our device failed: %d\n", ret); ret = picolcd_fb_reset(hid_get_drvdata(hdev), 0); if (ret) dbg_hid(PICOLCD_NAME " restoring framebuffer content failed: %d\n", ret); ret = picolcd_resume_lcd(hid_get_drvdata(hdev)); if (ret) dbg_hid(PICOLCD_NAME " restoring lcd failed: %d\n", ret); ret = picolcd_resume_backlight(hid_get_drvdata(hdev)); if (ret) dbg_hid(PICOLCD_NAME " restoring backlight failed: %d\n", ret); picolcd_leds_set(hid_get_drvdata(hdev)); return 0; } #endif /* initialize keypad input device */ static int picolcd_init_keys(struct picolcd_data *data, struct hid_report *report) { struct hid_device *hdev = data->hdev; struct input_dev *idev; int error, i; if (!report) return -ENODEV; if (report->maxfield != 1 || report->field[0]->report_count != 2 || report->field[0]->report_size != 8) { hid_err(hdev, "unsupported KEY_STATE report\n"); return -EINVAL; } idev = input_allocate_device(); if (idev == NULL) { hid_err(hdev, "failed to allocate input device\n"); return -ENOMEM; } input_set_drvdata(idev, hdev); memcpy(data->keycode, def_keymap, sizeof(def_keymap)); idev->name = hdev->name; idev->phys = hdev->phys; idev->uniq = hdev->uniq; idev->id.bustype = hdev->bus; idev->id.vendor = hdev->vendor; idev->id.product = hdev->product; idev->id.version = hdev->version; idev->dev.parent = &hdev->dev; idev->keycode = &data->keycode; idev->keycodemax = PICOLCD_KEYS; idev->keycodesize = sizeof(data->keycode[0]); input_set_capability(idev, EV_MSC, MSC_SCAN); set_bit(EV_REP, idev->evbit); for (i = 0; i < PICOLCD_KEYS; i++) input_set_capability(idev, EV_KEY, data->keycode[i]); error = input_register_device(idev); if (error) { hid_err(hdev, "error registering the input device\n"); input_free_device(idev); return error; } data->input_keys = idev; return 0; } static void picolcd_exit_keys(struct picolcd_data *data) { struct input_dev *idev = data->input_keys; data->input_keys = NULL; if (idev) input_unregister_device(idev); } static int picolcd_probe_lcd(struct hid_device *hdev, struct picolcd_data *data) { int error; /* Setup keypad input device */ error = picolcd_init_keys(data, picolcd_in_report(REPORT_KEY_STATE, hdev)); if (error) goto err; /* Setup CIR input device */ error = picolcd_init_cir(data, picolcd_in_report(REPORT_IR_DATA, hdev)); if (error) goto err; /* Setup lcd class device */ error = picolcd_init_lcd(data, picolcd_out_report(REPORT_CONTRAST, hdev)); if (error) goto err; /* Setup backlight class device */ error = picolcd_init_backlight(data, picolcd_out_report(REPORT_BRIGHTNESS, hdev)); if (error) goto err; /* Set up the framebuffer device */ error = picolcd_init_framebuffer(data); if (error) goto err; /* Setup the LED 
class devices */ error = picolcd_init_leds(data, picolcd_out_report(REPORT_LED_STATE, hdev)); if (error) goto err; picolcd_init_devfs(data, picolcd_out_report(REPORT_EE_READ, hdev), picolcd_out_report(REPORT_EE_WRITE, hdev), picolcd_out_report(REPORT_READ_MEMORY, hdev), picolcd_out_report(REPORT_WRITE_MEMORY, hdev), picolcd_out_report(REPORT_RESET, hdev)); return 0; err: picolcd_exit_leds(data); picolcd_exit_framebuffer(data); picolcd_exit_backlight(data); picolcd_exit_lcd(data); picolcd_exit_cir(data); picolcd_exit_keys(data); return error; } static int picolcd_probe_bootloader(struct hid_device *hdev, struct picolcd_data *data) { picolcd_init_devfs(data, NULL, NULL, picolcd_out_report(REPORT_BL_READ_MEMORY, hdev), picolcd_out_report(REPORT_BL_WRITE_MEMORY, hdev), NULL); return 0; } static int picolcd_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct picolcd_data *data; int error = -ENOMEM; dbg_hid(PICOLCD_NAME " hardware probe...\n"); /* * Let's allocate the picolcd data structure, set some reasonable * defaults, and associate it with the device */ data = kzalloc(sizeof(struct picolcd_data), GFP_KERNEL); if (data == NULL) { hid_err(hdev, "can't allocate space for Minibox PicoLCD device data\n"); return -ENOMEM; } spin_lock_init(&data->lock); mutex_init(&data->mutex); data->hdev = hdev; data->opmode_delay = 5000; if (hdev->product == USB_DEVICE_ID_PICOLCD_BOOTLOADER) data->status |= PICOLCD_BOOTLOADER; hid_set_drvdata(hdev, data); /* Parse the device reports and start it up */ error = hid_parse(hdev); if (error) { hid_err(hdev, "device report parse failed\n"); goto err_cleanup_data; } error = hid_hw_start(hdev, 0); if (error) { hid_err(hdev, "hardware start failed\n"); goto err_cleanup_data; } error = hid_hw_open(hdev); if (error) { hid_err(hdev, "failed to open input interrupt pipe for key and IR events\n"); goto err_cleanup_hid_hw; } error = device_create_file(&hdev->dev, &dev_attr_operation_mode_delay); if (error) { hid_err(hdev, "failed to create sysfs attributes\n"); goto err_cleanup_hid_ll; } error = device_create_file(&hdev->dev, &dev_attr_operation_mode); if (error) { hid_err(hdev, "failed to create sysfs attributes\n"); goto err_cleanup_sysfs1; } if (data->status & PICOLCD_BOOTLOADER) error = picolcd_probe_bootloader(hdev, data); else error = picolcd_probe_lcd(hdev, data); if (error) goto err_cleanup_sysfs2; dbg_hid(PICOLCD_NAME " activated and initialized\n"); return 0; err_cleanup_sysfs2: device_remove_file(&hdev->dev, &dev_attr_operation_mode); err_cleanup_sysfs1: device_remove_file(&hdev->dev, &dev_attr_operation_mode_delay); err_cleanup_hid_ll: hid_hw_close(hdev); err_cleanup_hid_hw: hid_hw_stop(hdev); err_cleanup_data: kfree(data); return error; } static void picolcd_remove(struct hid_device *hdev) { struct picolcd_data *data = hid_get_drvdata(hdev); unsigned long flags; dbg_hid(PICOLCD_NAME " hardware remove...\n"); spin_lock_irqsave(&data->lock, flags); data->status |= PICOLCD_FAILED; spin_unlock_irqrestore(&data->lock, flags); picolcd_exit_devfs(data); device_remove_file(&hdev->dev, &dev_attr_operation_mode); device_remove_file(&hdev->dev, &dev_attr_operation_mode_delay); hid_hw_close(hdev); hid_hw_stop(hdev); /* Shortcut potential pending reply that will never arrive */ spin_lock_irqsave(&data->lock, flags); if (data->pending) complete(&data->pending->ready); spin_unlock_irqrestore(&data->lock, flags); /* Cleanup LED */ picolcd_exit_leds(data); /* Clean up the framebuffer */ picolcd_exit_framebuffer(data); picolcd_exit_backlight(data); 
picolcd_exit_lcd(data); /* Cleanup input */ picolcd_exit_cir(data); picolcd_exit_keys(data); mutex_destroy(&data->mutex); /* Finally, clean up the picolcd data itself */ kfree(data); } static const struct hid_device_id picolcd_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICOLCD) }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICOLCD_BOOTLOADER) }, { } }; MODULE_DEVICE_TABLE(hid, picolcd_devices); static struct hid_driver picolcd_driver = { .name = "hid-picolcd", .id_table = picolcd_devices, .probe = picolcd_probe, .remove = picolcd_remove, .raw_event = picolcd_raw_event, #ifdef CONFIG_PM .suspend = picolcd_suspend, .resume = picolcd_resume, .reset_resume = picolcd_reset_resume, #endif }; module_hid_driver(picolcd_driver); MODULE_DESCRIPTION("Minibox graphics PicoLCD Driver"); MODULE_LICENSE("GPL v2");
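/*
 * Illustrative sketch (not part of the driver): how a raw scan value from a
 * REPORT_KEY_STATE byte maps to an input keycode, mirroring the lookup done
 * in picolcd_raw_keypad() above. demo_scan_to_keycode() is hypothetical;
 * struct picolcd_data, its keycode table and PICOLCD_KEYS come from
 * hid-picolcd.h.
 */
#include <linux/input.h>
#include <linux/types.h>
#include "hid-picolcd.h"

static unsigned int demo_scan_to_keycode(const struct picolcd_data *data,
					 u8 scan)
{
	/* Scan codes beyond the keymap are reported as KEY_UNKNOWN. */
	if (scan >= PICOLCD_KEYS)
		return KEY_UNKNOWN;
	return data->keycode[scan];
}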
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for INET connection oriented protocols. * * Definitions for inet_connection_sock * * Authors: Many people, see the TCP sources * * From code originally in TCP */ #ifndef _INET_CONNECTION_SOCK_H #define _INET_CONNECTION_SOCK_H #include <linux/compiler.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/poll.h> #include <linux/kernel.h> #include <linux/sockptr.h> #include <net/inet_sock.h> #include <net/request_sock.h> /* Cancel timers, when they are not required. */ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; struct inet_bind2_bucket; struct tcp_congestion_ops; /* * Pointers to address related TCP functions * (i.e.
things that depend on the address family) */ struct inet_connection_sock_af_ops { int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void (*send_check)(struct sock *sk, struct sk_buff *skb); int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); u16 net_header_len; u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); void (*mtu_reduced)(struct sock *sk); }; /** inet_connection_sock - INET connection oriented sock * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data * @icsk_clean_acked Clean acked data hook * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries * @icsk_probes_out: unanswered 0 window probes * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data * @icsk_mtup; MTU probing control data * @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack) * @icsk_user_timeout: TCP_USER_TIMEOUT value */ struct inet_connection_sock { /* inet_sock has to be the first member! 
*/ struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; __u32 icsk_rto_min; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; __u8 icsk_pending; __u8 icsk_backoff; __u8 icsk_syn_retries; __u8 icsk_probes_out; __u16 icsk_ext_hdr_len; struct { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ __u8 pingpong; /* The session is interactive */ __u8 retry; /* Number of attempts */ #define ATO_BITS 8 __u32 ato:ATO_BITS, /* Predicted tick of soft clock */ lrcv_flowlabel:20, /* last received ipv6 flowlabel */ unused:4; unsigned long timeout; /* Currently scheduled timeout */ __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; struct { /* Range of MTUs to search */ int search_high; int search_low; /* Information on the current probe. */ u32 probe_size:31, /* Is the MTUP feature enabled for this connection? */ enabled:1; u32 probe_timestamp; } icsk_mtup; u32 icsk_probes_tstamp; u32 icsk_user_timeout; u64 icsk_ca_priv[104 / sizeof(u64)]; #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ #define ICSK_TIME_DACK 2 /* Delayed ack timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ #define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk) static inline void *inet_csk_ca(const struct sock *sk) { return (void *)inet_csk(sk)->icsk_ca_priv; } struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority); enum inet_csk_ack_state_t { ICSK_ACK_SCHED = 1, ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED2 = 8, ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ ICSK_ACK_NOMEM = 32, }; void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *), void (*delack_handler)(struct timer_list *), void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); void inet_csk_clear_xmit_timers_sync(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; } static inline int inet_csk_ack_scheduled(const struct sock *sk) { return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; } static inline void inet_csk_delack_init(struct sock *sk) { memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } void inet_csk_delete_keepalive_timer(struct sock *sk); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == 
ICSK_TIME_PROBE0) { smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif } else { pr_debug("inet_csk BUG: unknown timer value\n"); } } /* * Reset the retransmission timer */ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { struct inet_connection_sock *icsk = inet_csk(sk); if (when > max_when) { pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, (void *)_THIS_IP_); when = max_when; } if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { smp_store_release(&icsk->icsk_pending, what); icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); } else { pr_debug("inet_csk BUG: unknown timer value\n"); } } static inline unsigned long inet_csk_rto_backoff(const struct inet_connection_sock *icsk, unsigned long max_when) { u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff; return (unsigned long)min_t(u64, when, max_when); } struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg); int inet_csk_get_port(struct sock *sk, unsigned short snum); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req); struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req); static inline void inet_csk_reqsk_queue_added(struct sock *sk) { reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue); } static inline int inet_csk_reqsk_queue_len(const struct sock *sk) { return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); } static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); static inline unsigned long reqsk_timeout(struct request_sock *req, unsigned long max_timeout) { u64 timeout = (u64)req->timeout << req->num_timeout; return (unsigned long)min_t(u64, timeout, max_timeout); } static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) { /* The below has to be done to allow calling inet_csk_destroy_sock */ sock_set_flag(sk, SOCK_DEAD); this_cpu_inc(*sk->sk_prot->orphan_count); } void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* * LISTEN is a special case for poll.. */ static inline __poll_t inet_csk_listen_poll(const struct sock *sk) { return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? 
(EPOLLIN | EPOLLRDNORM) : 0; } int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); /* update the fast reuse flag when adding a socket */ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); static inline void inet_csk_enter_pingpong_mode(struct sock *sk) { inet_csk(sk)->icsk_ack.pingpong = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_exit_pingpong_mode(struct sock *sk) { inet_csk(sk)->icsk_ack.pingpong = 0; } static inline bool inet_csk_in_pingpong_mode(struct sock *sk) { return inet_csk(sk)->icsk_ack.pingpong >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.pingpong < U8_MAX) icsk->icsk_ack.pingpong++; } static inline bool inet_csk_has_ulp(const struct sock *sk) { return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } static inline void inet_init_csk_locks(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); } #endif /* _INET_CONNECTION_SOCK_H */
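/*
 * Illustrative sketch (not part of this header): a congestion-control module
 * keeps its per-connection state in icsk_ca_priv and reaches it through
 * inet_csk_ca(), as the accessor defined above suggests. struct demo_ca and
 * demo_ca_init() are hypothetical; the BUILD_BUG_ON() guards against
 * exceeding ICSK_CA_PRIV_SIZE, the same limit real modules respect.
 */
#include <linux/build_bug.h>
#include <net/inet_connection_sock.h>

struct demo_ca {
	u32 cnt;		/* example per-connection counter */
	u32 last_ack;		/* example bookkeeping field */
};

static void demo_ca_init(struct sock *sk)
{
	struct demo_ca *ca = inet_csk_ca(sk);

	BUILD_BUG_ON(sizeof(struct demo_ca) > ICSK_CA_PRIV_SIZE);
	ca->cnt = 0;
	ca->last_ack = 0;
}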
// SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/inode.c - kernfs inode implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/security.h> #include "kernfs-internal.h" static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, }; static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) { static DEFINE_MUTEX(iattr_mutex); struct kernfs_iattrs *ret; mutex_lock(&iattr_mutex); if (kn->iattr || !alloc) goto out_unlock; kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!kn->iattr) goto out_unlock; /* assign default attributes */ kn->iattr->ia_uid = GLOBAL_ROOT_UID; kn->iattr->ia_gid = GLOBAL_ROOT_GID; ktime_get_real_ts64(&kn->iattr->ia_atime); kn->iattr->ia_mtime = kn->iattr->ia_atime; kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); atomic_set(&kn->iattr->nr_user_xattrs, 0); atomic_set(&kn->iattr->user_xattr_size, 0); out_unlock: ret = kn->iattr; mutex_unlock(&iattr_mutex); return ret; } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { return __kernfs_iattrs(kn, 1); } static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) { return __kernfs_iattrs(kn, 0); } int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs;
unsigned int ia_valid = iattr->ia_valid; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (ia_valid & ATTR_UID) attrs->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) attrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) attrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) attrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) attrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) kn->mode = iattr->ia_mode; return 0; } /** * kernfs_setattr - set iattr on a node * @kn: target node * @iattr: iattr to set * * Return: %0 on success, -errno on failure. */ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); up_write(&root->kernfs_iattr_rwsem); return ret; } int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root; int error; if (!kn) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; error = __kernfs_setattr(kn, iattr); if (error) goto out; /* this ignores size changes */ setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_iattr_rwsem); return error; } ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode *inode, struct kernfs_iattrs *attrs) { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; inode_set_atime_to_ts(inode, attrs->ia_atime); inode_set_mtime_to_ts(inode, attrs->ia_mtime); inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { struct kernfs_iattrs *attrs = kn->iattr; inode->i_mode = kn->mode; if (attrs) /* * kernfs_node has non-default attributes get them from * persistent copy in kernfs_node. 
*/ set_inode_attr(inode, attrs); if (kernfs_type(kn) == KERNFS_DIR) set_nlink(inode, kn->dir.subdirs + 2); } int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; } static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); /* initialize inode according to type */ switch (kernfs_type(kn)) { case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; if (kn->flags & KERNFS_EMPTY_DIR) make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_fops; break; case KERNFS_LINK: inode->i_op = &kernfs_symlink_iops; break; default: BUG(); } unlock_new_inode(inode); } /** * kernfs_get_inode - get inode for kernfs_node * @sb: super block * @kn: kernfs_node to allocate inode for * * Get inode for @kn. If such inode doesn't exist, a new inode is * allocated and basics are initialized. New inode is returned * locked. * * Locking: * Kernel thread context (may sleep). * * Return: * Pointer to allocated inode on success, %NULL on failure. */ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); return inode; } /* * The kernfs_node serves as both an inode and a directory entry for * kernfs. To prevent the kernfs inode numbers from being freed * prematurely we take a reference to kernfs_node from the kernfs inode. A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. 
*/ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; struct kernfs_root *root; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_iattr_rwsem); return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); if (!attrs) return -ENODATA; return simple_xattr_get(&attrs->xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); simple_xattr_free(old_xattr); return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_get(kn, name, value, size); } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_set(kn, name, value, size, flags); } static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { ret = -ENOSPC; goto dec_count_out; } if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { ret = -ENOSPC; goto dec_size_out; } old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); if (!old_xattr) return 0; if (IS_ERR(old_xattr)) { ret = PTR_ERR(old_xattr); goto dec_size_out; } ret = 0; size = old_xattr->size; simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: atomic_dec(nr); return ret; } static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; struct simple_xattr *old_xattr; old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); if (!old_xattr) return 0; if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); atomic_sub(old_xattr->size, sz); atomic_dec(nr); simple_xattr_free(old_xattr); return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *full_name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; if (!(kernfs_root(kn)->flags & 
KERNFS_ROOT_SUPPORT_USER_XATTR)) return -EOPNOTSUPP; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (value) return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, value, size, flags); else return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_user_xattr_set, }; const struct xattr_handler * const kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, NULL };
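/*
 * Illustrative sketch (not part of kernfs): how a kernfs user changes the
 * ownership of a node through kernfs_setattr(), which takes the
 * kernfs_iattr_rwsem and funnels into __kernfs_setattr() above.
 * demo_chown_kn() is hypothetical; kernfs_setattr() and struct iattr are the
 * real interfaces shown in this file.
 */
#include <linux/fs.h>
#include <linux/kernfs.h>

static int demo_chown_kn(struct kernfs_node *kn, kuid_t uid, kgid_t gid)
{
	struct iattr iattr = {
		.ia_valid = ATTR_UID | ATTR_GID,
		.ia_uid = uid,
		.ia_gid = gid,
	};

	/* Persisted in kn->iattr and picked up on the next inode refresh. */
	return kernfs_setattr(kn, &iattr);
}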
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *     Thus: Perfect SMP scaling between independent semaphore arrays.
 *     If multiple semaphores in one array are used, then cache line
 *     thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare())
 * - All work is done by the waker; the woken up task does not have to do
 *   anything - not even acquire a lock or drop a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and a per-semaphore list (stored in the array). This allows achieving FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */

#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sched/wake_q.h>
#include <linux/nospec.h>
#include <linux/rhashtable.h>

#include <linux/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem {
        int     semval;         /* current value */
        /*
         * PID of the process that last modified the semaphore. For
         * Linux, specifically these are:
         *  - semop
         *  - semctl, via SETVAL and SETALL.
         *  - at task exit when performing undo adjustments (see exit_sem).
         */
        struct pid *sempid;
        spinlock_t      lock;   /* spinlock for fine-grained semtimedop */
        struct list_head pending_alter; /* pending single-sop operations */
                                        /* that alter the semaphore */
        struct list_head pending_const; /* pending single-sop operations */
                                        /* that do not alter the semaphore*/
        time64_t        sem_otime;      /* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/* One sem_array data structure for each set of semaphores in the system. */
struct sem_array {
        struct kern_ipc_perm    sem_perm;       /* permissions .. see ipc.h */
        time64_t                sem_ctime;      /* create/last semctl() time */
        struct list_head        pending_alter;  /* pending operations */
                                                /* that alter the array */
        struct list_head        pending_const;  /* pending complex operations */
                                                /* that do not alter semvals */
        struct list_head        list_id;        /* undo requests on this array */
        int                     sem_nsems;      /* no. of semaphores in array */
        int                     complex_count;  /* pending complex operations */
        unsigned int            use_global_lock;/* >0: global lock required */

        struct sem              sems[];
} __randomize_layout;

/* One queue for each sleeping process in the system. */
struct sem_queue {
        struct list_head        list;     /* queue of pending operations */
        struct task_struct      *sleeper; /* this process */
        struct sem_undo         *undo;    /* undo structure */
        struct pid              *pid;     /* process id of requesting process */
        int                     status;   /* completion status of operation */
        struct sembuf           *sops;    /* array of pending operations */
        struct sembuf           *blocking; /* the operation that blocked */
        int                     nsops;    /* number of operations */
        bool                    alter;    /* does *sops alter the array? */
        bool                    dupsop;   /* sops on more than one sem_num */
};

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
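 *
 * Illustrative example (a sketch, not part of the original source): a
 * user-space task that decrements with SEM_UNDO, e.g.
 *
 *      struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
 *      semop(semid, &op, 1);
 *
 * accumulates semadj[0] == +1 in its sem_undo entry (semadj is the negated
 * sum of all undoable sem_ops, see perform_atomic_semop()). If the task
 * exits without reversing the operation itself, exit_sem() applies the
 * adjustment so the semaphore value is restored.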
 */
struct sem_undo {
        struct list_head        list_proc;      /* per-process list:
                                                 * all undos from one process
                                                 * rcu protected */
        struct rcu_head         rcu;            /* rcu struct for sem_undo */
        struct sem_undo_list    *ulp;           /* back ptr to sem_undo_list */
        struct list_head        list_id;        /* per semaphore array list:
                                                 * all undos for one array */
        int                     semid;          /* semaphore set identifier */
        short                   semadj[];       /* array of adjustments */
                                                /* one per semaphore */
};

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks in a CLONE_SYSVSEM task group.
 */
struct sem_undo_list {
        refcount_t              refcnt;
        spinlock_t              lock;
        struct list_head        list_proc;
};


#define sem_ids(ns)     ((ns)->ids[IPC_SEM_IDS])

static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

#define SEMMSL_FAST     256 /* 512 bytes on stack */
#define SEMOPM_FAST     64  /* ~ 372 bytes on stack */

/*
 * Switching from the mode suitable for simple ops
 * to the mode for complex ops is costly. Therefore:
 * use some hysteresis
 */
#define USE_GLOBAL_LOCK_HYSTERESIS      10

/*
 * Locking:
 * a) global sem_lock() for read/write
 *      sem_undo.id_next,
 *      sem_array.complex_count,
 *      sem_array.pending{_alter,_const},
 *      sem_array.sem_undo
 *
 * b) global or semaphore sem_lock() for read/write:
 *      sem_array.sems[i].pending_{const,alter}:
 *
 * c) special:
 *      sem_undo_list.list_proc:
 *      * undo_list->lock for write
 *      * rcu for read
 *      use_global_lock:
 *      * global sem_lock() for write
 *      * either local or global sem_lock() for read.
 *
 * Memory ordering:
 * Most ordering is enforced by using spin_lock() and spin_unlock().
 *
 * Exceptions:
 * 1) use_global_lock: (SEM_BARRIER_1)
 * Setting it from non-zero to 0 is a RELEASE, this is ensured by
 * using smp_store_release(): Immediately after setting it to 0,
 * a simple op can start.
 * Testing if it is non-zero is an ACQUIRE, this is ensured by using
 * smp_load_acquire().
 * Setting it from 0 to non-zero must be ordered with regards to
 * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
 * is inside a spin_lock() and after a write from 0 to non-zero a
 * spin_lock()+spin_unlock() is done.
 * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
 * READ_ONCE()/WRITE_ONCE() is used.
 *
 * 2) queue.status: (SEM_BARRIER_2)
 * Initialization is done while holding sem_lock(), so no further barrier is
 * required.
 * Setting it to a result code is a RELEASE, this is ensured by both a
 * smp_store_release() (for case a) and while holding sem_lock()
 * (for case b).
 * The ACQUIRE when reading the result code without holding sem_lock() is
 * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
 * (case a above).
 * Reading the result code while holding sem_lock() needs no further barriers,
 * the locks inside sem_lock() enforce ordering (case b above).
 *
 * 3) current->state:
 * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
 * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
 * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
 * when holding sem_lock(), no further barriers are required.
 *
 * See also ipc/mqueue.c for more details on the covered races.
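 *
 * A minimal sketch of the SEM_BARRIER_1 pairing described above (illustrative
 * only; the real code lives in complexmode_tryleave() and sem_lock()):
 *
 *      CPU0 (leaving complex mode)          CPU1 (simple op fast path)
 *      smp_store_release(
 *              &sma->use_global_lock, 0);
 *                                           spin_lock(&sem->lock);
 *                                           if (!smp_load_acquire(
 *                                                   &sma->use_global_lock))
 *                                                   // per-semaphore lock is enough
 *
 * The RELEASE on CPU0 guarantees that everything done under the global lock
 * is visible to CPU1 once its ACQUIRE observes use_global_lock == 0.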
*/ #define sc_semmsl sem_ctls[0] #define sc_semmns sem_ctls[1] #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] void sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif void __init sem_init(void) { sem_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); } /** * unmerge_queues - unmerge queues, if possible. * @sma: semaphore array * * The function unmerges the wait queues if complex_count is 0. * It must be called prior to dropping the global semaphore array lock. */ static void unmerge_queues(struct sem_array *sma) { struct sem_queue *q, *tq; /* complex operations still around? */ if (sma->complex_count) return; /* * We will switch back to simple mode. * Move all pending operation back into the per-semaphore * queues. */ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { struct sem *curr; curr = &sma->sems[q->sops[0].sem_num]; list_add_tail(&q->list, &curr->pending_alter); } INIT_LIST_HEAD(&sma->pending_alter); } /** * merge_queues - merge single semop queues into global queue * @sma: semaphore array * * This function merges all per-semaphore queues into the global queue. * It is necessary to achieve FIFO ordering for the pending single-sop * operations when a multi-semop operation must sleep. * Only the alter operations must be moved, the const operations can stay. */ static void merge_queues(struct sem_array *sma) { int i; for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = &sma->sems[i]; list_splice_init(&sem->pending_alter, &sma->pending_alter); } } static void sem_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); struct sem_array *sma = container_of(p, struct sem_array, sem_perm); security_sem_free(&sma->sem_perm); kvfree(sma); } /* * Enter the mode suitable for non-simple operations: * Caller must own sem_perm.lock. */ static void complexmode_enter(struct sem_array *sma) { int i; struct sem *sem; if (sma->use_global_lock > 0) { /* * We are already in global lock mode. * Nothing to do, just reset the * counter until we return to simple mode. */ WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); return; } WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); for (i = 0; i < sma->sem_nsems; i++) { sem = &sma->sems[i]; spin_lock(&sem->lock); spin_unlock(&sem->lock); } } /* * Try to leave the mode that disallows simple operations: * Caller must own sem_perm.lock. */ static void complexmode_tryleave(struct sem_array *sma) { if (sma->complex_count) { /* Complex ops are sleeping. * We must stay in complex mode */ return; } if (sma->use_global_lock == 1) { /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { WRITE_ONCE(sma->use_global_lock, sma->use_global_lock-1); } } #define SEM_GLOBAL_LOCK (-1) /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. 
* Otherwise, lock the entire semaphore array, since we either have * multiple semaphores in our own semops, or we need to look at * semaphores from other pending complex operations. */ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, int nsops) { struct sem *sem; int idx; if (nsops != 1) { /* Complex operation - acquire a full lock */ ipc_lock_object(&sma->sem_perm); /* Prevent parallel simple ops */ complexmode_enter(sma); return SEM_GLOBAL_LOCK; } /* * Only one semaphore affected - try to optimize locking. * Optimized locking is possible if no complex operation * is either enqueued or processed right now. * * Both facts are tracked by use_global_mode. */ idx = array_index_nospec(sops->sem_num, sma->sem_nsems); sem = &sma->sems[idx]; /* * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. */ if (!READ_ONCE(sma->use_global_lock)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); /* see SEM_BARRIER_1 for purpose/pairing */ if (!smp_load_acquire(&sma->use_global_lock)) { /* fast path successful! */ return sops->sem_num; } spin_unlock(&sem->lock); } /* slow path: acquire the full lock */ ipc_lock_object(&sma->sem_perm); if (sma->use_global_lock == 0) { /* * The use_global_lock mode ended while we waited for * sma->sem_perm.lock. Thus we must switch to locking * with sem->lock. * Unlike in the fast path, there is no need to recheck * sma->use_global_lock after we have acquired sem->lock: * We own sma->sem_perm.lock, thus use_global_lock cannot * change. */ spin_lock(&sem->lock); ipc_unlock_object(&sma->sem_perm); return sops->sem_num; } else { /* * Not a false alarm, thus continue to use the global lock * mode. No need for complexmode_enter(), this was done by * the caller that has set use_global_mode to non-zero. */ return SEM_GLOBAL_LOCK; } } static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == SEM_GLOBAL_LOCK) { unmerge_queues(sma); complexmode_tryleave(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = &sma->sems[locknum]; spin_unlock(&sem->lock); } } /* * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. 
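 * sem_obtain_object() returns the object without further checks, while
 * sem_obtain_object_check() is expected to additionally verify the sequence
 * number embedded in the id (see ipc_obtain_object_check()), so an id that
 * was deleted and then reused is reported as an error instead of being
 * silently matched.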
*/ static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) { ipc_rmid(&sem_ids(ns), &s->sem_perm); } static struct sem_array *sem_alloc(size_t nsems) { struct sem_array *sma; if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; return sma; } /** * newary - Create a new semaphore set * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) { int retval; struct sem_array *sma; key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; int i; if (!nsems) return -EINVAL; if (ns->used_sems + nsems > ns->sc_semmns) return -ENOSPC; sma = sem_alloc(nsems); if (!sma) return -ENOMEM; sma->sem_perm.mode = (semflg & S_IRWXUGO); sma->sem_perm.key = key; sma->sem_perm.security = NULL; retval = security_sem_alloc(&sma->sem_perm); if (retval) { kvfree(sma); return retval; } for (i = 0; i < nsems; i++) { INIT_LIST_HEAD(&sma->sems[i].pending_alter); INIT_LIST_HEAD(&sma->sems[i].pending_const); spin_lock_init(&sma->sems[i].lock); } sma->complex_count = 0; sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = ktime_get_real_seconds(); /* ipc_addid() locks sma upon success. */ retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return retval; } ns->used_sems += nsems; sem_unlock(sma, -1); rcu_read_unlock(); return sma->sem_perm.id; } /* * Called with sem_ids.rwsem and ipcp locked. */ static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct sem_array *sma; sma = container_of(ipcp, struct sem_array, sem_perm); if (params->u.nsems > sma->sem_nsems) return -EINVAL; return 0; } long ksys_semget(key_t key, int nsems, int semflg) { struct ipc_namespace *ns; static const struct ipc_ops sem_ops = { .getnew = newary, .associate = security_sem_associate, .more_checks = sem_more_checks, }; struct ipc_params sem_params; ns = current->nsproxy->ipc_ns; if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; sem_params.key = key; sem_params.flg = semflg; sem_params.u.nsems = nsems; return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) { return ksys_semget(key, nsems, semflg); } /** * perform_atomic_semop[_slow] - Attempt to perform semaphore * operations on a given array. 
* @sma: semaphore array * @q: struct sem_queue that describes the operation * * Caller blocking are as follows, based the value * indicated by the semaphore operation (sem_op): * * (1) >0 never blocks. * (2) 0 (wait-for-zero operation): semval is non-zero. * (3) <0 attempting to decrement semval to a value smaller than zero. * * Returns 0 if the operation was possible. * Returns 1 if the operation is impossible, the caller must sleep. * Returns <0 for error codes. */ static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops; struct pid *pid; struct sembuf *sop; struct sem *curr; struct sembuf *sops; struct sem_undo *un; sops = q->sops; nsops = q->nsops; un = q->undo; for (sop = sops; sop < sops + nsops; sop++) { int idx = array_index_nospec(sop->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; sem_op = sop->sem_op; result = curr->semval; if (!sem_op && result) goto would_block; result += sem_op; if (result < 0) goto would_block; if (result > SEMVMX) goto out_of_range; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) goto out_of_range; un->semadj[sop->sem_num] = undo; } curr->semval = result; } sop--; pid = q->pid; while (sop >= sops) { ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid); sop--; } return 0; out_of_range: result = -ERANGE; goto undo; would_block: q->blocking = sop; if (sop->sem_flg & IPC_NOWAIT) result = -EAGAIN; else result = 1; undo: sop--; while (sop >= sops) { sem_op = sop->sem_op; sma->sems[sop->sem_num].semval -= sem_op; if (sop->sem_flg & SEM_UNDO) un->semadj[sop->sem_num] += sem_op; sop--; } return result; } static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops; struct sembuf *sop; struct sem *curr; struct sembuf *sops; struct sem_undo *un; sops = q->sops; nsops = q->nsops; un = q->undo; if (unlikely(q->dupsop)) return perform_atomic_semop_slow(sma, q); /* * We scan the semaphore set twice, first to ensure that the entire * operation can succeed, therefore avoiding any pointless writes * to shared memory and having to undo such changes in order to block * until the operations can go through. */ for (sop = sops; sop < sops + nsops; sop++) { int idx = array_index_nospec(sop->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; sem_op = sop->sem_op; result = curr->semval; if (!sem_op && result) goto would_block; /* wait-for-zero */ result += sem_op; if (result < 0) goto would_block; if (result > SEMVMX) return -ERANGE; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) return -ERANGE; } } for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; un->semadj[sop->sem_num] = undo; } curr->semval += sem_op; ipc_update_pid(&curr->sempid, q->pid); } return 0; would_block: q->blocking = sop; return sop->sem_flg & IPC_NOWAIT ? 
-EAGAIN : 1; } static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, struct wake_q_head *wake_q) { struct task_struct *sleeper; sleeper = get_task_struct(q->sleeper); /* see SEM_BARRIER_2 for purpose/pairing */ smp_store_release(&q->status, error); wake_q_add_safe(wake_q, sleeper); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) { list_del(&q->list); if (q->nsops > 1) sma->complex_count--; } /** check_restart(sma, q) * @sma: semaphore array * @q: the operation that just completed * * update_queue is O(N^2) when it restarts scanning the whole queue of * waiting operations. Therefore this function checks if the restart is * really necessary. It is called after a previously waiting operation * modified the array. * Note that wait-for-zero operations are handled without restart. */ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) { /* pending complex alter operations are too difficult to analyse */ if (!list_empty(&sma->pending_alter)) return 1; /* we were a sleeping complex operation. Too difficult */ if (q->nsops > 1) return 1; /* It is impossible that someone waits for the new value: * - complex operations always restart. * - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. * - If there are older (higher priority) decrements * in the queue, then they have observed the original * semval value and couldn't proceed. The operation * decremented to value - thus they won't proceed either. */ return 0; } /** * wake_const_ops - wake up non-alter tasks * @sma: semaphore array. * @semnum: semaphore that was modified. * @wake_q: lockless wake-queue head. * * wake_const_ops must be called after a semaphore in a semaphore array * was set to 0. If complex const operations are pending, wake_const_ops must * be called with semnum = -1, as well as with the number of each modified * semaphore. * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function returns 1 if at least one operation was completed successfully. */ static int wake_const_ops(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; if (semnum == -1) pending_list = &sma->pending_const; else pending_list = &sma->sems[semnum].pending_const; list_for_each_entry_safe(q, tmp, pending_list, list) { int error = perform_atomic_semop(sma, q); if (error > 0) continue; /* operation completed, remove from queue & wakeup */ unlink_queue(sma, q); wake_up_sem_queue_prepare(q, error, wake_q); if (error == 0) semop_completed = 1; } return semop_completed; } /** * do_smart_wakeup_zero - wakeup all wait for zero tasks * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations * @wake_q: lockless wake-queue head * * Checks all required queue for wait-for-zero operations, based * on the actual changes that were performed on the semaphore array. * The function returns 1 if at least one operation was completed successfully. 
*/ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, int nsops, struct wake_q_head *wake_q) { int i; int semop_completed = 0; int got_zero = 0; /* first: the per-semaphore queues, if known */ if (sops) { for (i = 0; i < nsops; i++) { int num = sops[i].sem_num; if (sma->sems[num].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, num, wake_q); } } } else { /* * No sops means modified semaphores not known. * Assume all were changed. */ for (i = 0; i < sma->sem_nsems; i++) { if (sma->sems[i].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, i, wake_q); } } } /* * If one of the modified semaphores got 0, * then check the global queue, too. */ if (got_zero) semop_completed |= wake_const_ops(sma, -1, wake_q); return semop_completed; } /** * update_queue - look for tasks that can be completed. * @sma: semaphore array. * @semnum: semaphore that was modified. * @wake_q: lockless wake-queue head. * * update_queue must be called after a semaphore in a semaphore array * was modified. If multiple semaphores were modified, update_queue must * be called with semnum = -1, as well as with the number of each modified * semaphore. * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function internally checks if const operations can now succeed. * * The function return 1 if at least one semop was completed successfully. */ static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; if (semnum == -1) pending_list = &sma->pending_alter; else pending_list = &sma->sems[semnum].pending_alter; again: list_for_each_entry_safe(q, tmp, pending_list, list) { int error, restart; /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not * necessary to scan further: simple increments * that affect only one entry succeed immediately and cannot * be in the per semaphore pending queue, and decrements * cannot be successful if the value is already 0. */ if (semnum != -1 && sma->sems[semnum].semval == 0) break; error = perform_atomic_semop(sma, q); /* Does q->sleeper still need to sleep? */ if (error > 0) continue; unlink_queue(sma, q); if (error) { restart = 0; } else { semop_completed = 1; do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); restart = check_restart(sma, q); } wake_up_sem_queue_prepare(q, error, wake_q); if (restart) goto again; } return semop_completed; } /** * set_semotime - set sem_otime * @sma: semaphore array * @sops: operations that modified the array, may be NULL * * sem_otime is replicated to avoid cache line trashing. * This function sets one instance to the current time. */ static void set_semotime(struct sem_array *sma, struct sembuf *sops) { if (sops == NULL) { sma->sems[0].sem_otime = ktime_get_real_seconds(); } else { sma->sems[sops[0].sem_num].sem_otime = ktime_get_real_seconds(); } } /** * do_smart_update - optimized update_queue * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations * @otime: force setting otime * @wake_q: lockless wake-queue head * * do_smart_update() does the required calls to update_queue and wakeup_zero, * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is * responsible for calling wake_up_q(). * It is safe to perform this call after dropping all locks. 
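 *
 * Typical caller pattern (an illustrative sketch; compare semctl_setval()):
 *
 *      DEFINE_WAKE_Q(wake_q);
 *      sem_lock(sma, NULL, -1);
 *      ...modify the array...
 *      do_smart_update(sma, NULL, 0, 0, &wake_q);
 *      sem_unlock(sma, -1);
 *      rcu_read_unlock();
 *      wake_up_q(&wake_q);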
*/ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, int otime, struct wake_q_head *wake_q) { int i; otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); if (!list_empty(&sma->pending_alter)) { /* semaphore array uses the global queue - just process it. */ otime |= update_queue(sma, -1, wake_q); } else { if (!sops) { /* * No sops, thus the modified semaphores are not * known. Check all. */ for (i = 0; i < sma->sem_nsems; i++) otime |= update_queue(sma, i, wake_q); } else { /* * Check the semaphores that were increased: * - No complex ops, thus all sleeping ops are * decrease. * - if we decreased the value, then any sleeping * semaphore ops won't be able to run: If the * previous value was too small, then the new * value will be too small, too. */ for (i = 0; i < nsops; i++) { if (sops[i].sem_op > 0) { otime |= update_queue(sma, sops[i].sem_num, wake_q); } } } } if (otime) set_semotime(sma, sops); } /* * check_qop: Test if a queued operation sleeps on the semaphore semnum */ static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q, bool count_zero) { struct sembuf *sop = q->blocking; /* * Linux always (since 0.99.10) reported a task as sleeping on all * semaphores. This violates SUS, therefore it was changed to the * standard compliant behavior. * Give the administrators a chance to notice that an application * might misbehave because it relies on the Linux behavior. */ pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n" "The task %s (%d) triggered the difference, watch for misbehavior.\n", current->comm, task_pid_nr(current)); if (sop->sem_num != semnum) return 0; if (count_zero && sop->sem_op == 0) return 1; if (!count_zero && sop->sem_op < 0) return 1; return 0; } /* The following counts are associated to each semaphore: * semncnt number of tasks waiting on semval being nonzero * semzcnt number of tasks waiting on semval being zero * * Per definition, a task waits only on the semaphore of the first semop * that cannot proceed, even if additional operation would block, too. */ static int count_semcnt(struct sem_array *sma, ushort semnum, bool count_zero) { struct list_head *l; struct sem_queue *q; int semcnt; semcnt = 0; /* First: check the simple operations. They are easy to evaluate */ if (count_zero) l = &sma->sems[semnum].pending_const; else l = &sma->sems[semnum].pending_alter; list_for_each_entry(q, l, list) { /* all task on a per-semaphore list sleep on exactly * that semaphore */ semcnt++; } /* Then: check the complex operations. */ list_for_each_entry(q, &sma->pending_alter, list) { semcnt += check_qop(sma, semnum, q, count_zero); } if (count_zero) { list_for_each_entry(q, &sma->pending_const, list) { semcnt += check_qop(sma, semnum, q, count_zero); } } return semcnt; } /* Free a semaphore set. freeary() is called with sem_ids.rwsem locked * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct sem_undo *un, *tu; struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); int i; DEFINE_WAKE_Q(wake_q); /* Free the existing undo structures for this semaphore set. 
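 * The undo entries are only unlinked and invalidated here (un->semid = -1);
 * the structures themselves are freed via kvfree_rcu() once RCU readers are
 * done. Every task still sleeping on the set is woken up with -EIDRM below.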
*/ ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry_safe(un, tu, &sma->list_id, list_id) { list_del(&un->list_id); spin_lock(&un->ulp->lock); un->semid = -1; list_del_rcu(&un->list_proc); spin_unlock(&un->ulp->lock); kvfree_rcu(un, rcu); } /* Wake up all pending processes and let them fail with EIDRM. */ list_for_each_entry_safe(q, tq, &sma->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = &sma->sems[i]; list_for_each_entry_safe(q, tq, &sem->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } ipc_update_pid(&sem->sempid, NULL); } /* Remove the semaphore set from the IDR */ sem_rmid(ns, sma); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); ns->used_sems -= sma->sem_nsems; ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct semid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm); out.sem_otime = in->sem_otime; out.sem_ctime = in->sem_ctime; out.sem_nsems = in->sem_nsems; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static time64_t get_semotime(struct sem_array *sma) { int i; time64_t res; res = sma->sems[0].sem_otime; for (i = 1; i < sma->sem_nsems; i++) { time64_t to = sma->sems[i].sem_otime; if (to > res) res = to; } return res; } static int semctl_stat(struct ipc_namespace *ns, int semid, int cmd, struct semid64_ds *semid64) { struct sem_array *sma; time64_t semotime; int err; memset(semid64, 0, sizeof(*semid64)); rcu_read_lock(); if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) { sma = sem_obtain_object(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } } /* see comment for SHM_STAT_ANY */ if (cmd == SEM_STAT_ANY) audit_ipc_obj(&sma->sem_perm); else { err = -EACCES; if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) goto out_unlock; } err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&sma->sem_perm); if (!ipc_valid_object(&sma->sem_perm)) { ipc_unlock_object(&sma->sem_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm); semotime = get_semotime(sma); semid64->sem_otime = semotime; semid64->sem_ctime = sma->sem_ctime; #ifndef CONFIG_64BIT semid64->sem_otime_high = semotime >> 32; semid64->sem_ctime_high = sma->sem_ctime >> 32; #endif semid64->sem_nsems = sma->sem_nsems; if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * SEM_STAT and SEM_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = sma->sem_perm.id; } ipc_unlock_object(&sma->sem_perm); out_unlock: rcu_read_unlock(); return err; } static int semctl_info(struct ipc_namespace *ns, int semid, int cmd, void __user *p) { struct seminfo seminfo; int max_idx; int err; err = security_sem_semctl(NULL, cmd); if (err) return err; memset(&seminfo, 0, sizeof(seminfo)); 
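	/*
	 * The first four fields below are the per-namespace runtime tunables
	 * (/proc/sys/kernel/sem); the remaining ones are compile-time
	 * constants. For SEM_INFO, semusz/semaem are reused further down to
	 * report current usage (sets in use, total semaphores) instead.
	 */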
seminfo.semmni = ns->sc_semmni; seminfo.semmns = ns->sc_semmns; seminfo.semmsl = ns->sc_semmsl; seminfo.semopm = ns->sc_semopm; seminfo.semvmx = SEMVMX; seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; } else { seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } max_idx = ipc_get_maxidx(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_idx < 0) ? 0 : max_idx; } static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, int val) { struct sem_undo *un; struct sem_array *sma; struct sem *curr; int err; DEFINE_WAKE_Q(wake_q); if (val > SEMVMX || val < 0) return -ERANGE; rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return PTR_ERR(sma); } if (semnum < 0 || semnum >= sma->sem_nsems) { rcu_read_unlock(); return -EINVAL; } if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) { rcu_read_unlock(); return -EACCES; } err = security_sem_semctl(&sma->sem_perm, SETVAL); if (err) { rcu_read_unlock(); return -EACCES; } sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); return -EIDRM; } semnum = array_index_nospec(semnum, sma->sem_nsems); curr = &sma->sems[semnum]; ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; curr->semval = val; ipc_update_pid(&curr->sempid, task_tgid(current)); sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); return 0; } static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int cmd, void __user *p) { struct sem_array *sma; struct sem *curr; int err, nsems; ushort fast_sem_io[SEMMSL_FAST]; ushort *sem_io = fast_sem_io; DEFINE_WAKE_Q(wake_q); rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return PTR_ERR(sma); } nsems = sma->sem_nsems; err = -EACCES; if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? 
S_IWUGO : S_IRUGO)) goto out_rcu_wakeup; err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_rcu_wakeup; switch (cmd) { case GETALL: { ushort __user *array = p; int i; sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } if (nsems > SEMMSL_FAST) { if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); sem_io = kvmalloc_array(nsems, sizeof(ushort), GFP_KERNEL); if (sem_io == NULL) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } } for (i = 0; i < sma->sem_nsems; i++) sem_io[i] = sma->sems[i].semval; sem_unlock(sma, -1); rcu_read_unlock(); err = 0; if (copy_to_user(array, sem_io, nsems*sizeof(ushort))) err = -EFAULT; goto out_free; } case SETALL: { int i; struct sem_undo *un; if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_rcu_wakeup; } rcu_read_unlock(); if (nsems > SEMMSL_FAST) { sem_io = kvmalloc_array(nsems, sizeof(ushort), GFP_KERNEL); if (sem_io == NULL) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } } if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -ERANGE; goto out_free; } } rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } for (i = 0; i < nsems; i++) { sma->sems[i].semval = sem_io[i]; ipc_update_pid(&sma->sems[i].sempid, task_tgid(current)); } ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) { for (i = 0; i < nsems; i++) un->semadj[i] = 0; } sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); err = 0; goto out_unlock; } /* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */ } err = -EINVAL; if (semnum < 0 || semnum >= nsems) goto out_rcu_wakeup; sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } semnum = array_index_nospec(semnum, nsems); curr = &sma->sems[semnum]; switch (cmd) { case GETVAL: err = curr->semval; goto out_unlock; case GETPID: err = pid_vnr(curr->sempid); goto out_unlock; case GETNCNT: err = count_semcnt(sma, semnum, 0); goto out_unlock; case GETZCNT: err = count_semcnt(sma, semnum, 1); goto out_unlock; } out_unlock: sem_unlock(sma, -1); out_rcu_wakeup: rcu_read_unlock(); wake_up_q(&wake_q); out_free: if (sem_io != fast_sem_io) kvfree(sem_io); return err; } static inline unsigned long copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct semid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->sem_perm.uid = tbuf_old.sem_perm.uid; out->sem_perm.gid = tbuf_old.sem_perm.gid; out->sem_perm.mode = tbuf_old.sem_perm.mode; return 0; } default: return -EINVAL; } } /* * This function handles some semctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. 
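 * Only IPC_RMID and IPC_SET are routed here (see ksys_semctl() and
 * compat_ksys_semctl()).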
*/ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, struct semid64_ds *semid64) { struct sem_array *sma; int err; struct kern_ipc_perm *ipcp; down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd, &semid64->sem_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } sma = container_of(ipcp, struct sem_array, sem_perm); err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: sem_lock(sma, NULL, -1); /* freeary unlocks the ipc object and rcu */ freeary(ns, ipcp); goto out_up; case IPC_SET: sem_lock(sma, NULL, -1); err = ipc_update_perm(&semid64->sem_perm, ipcp); if (err) goto out_unlock0; sma->sem_ctime = ktime_get_real_seconds(); break; default: err = -EINVAL; goto out_unlock1; } out_unlock0: sem_unlock(sma, -1); out_unlock1: rcu_read_unlock(); out_up: up_write(&sem_ids(ns).rwsem); return err; } static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version) { struct ipc_namespace *ns; void __user *p = (void __user *)arg; struct semid64_ds semid64; int err; if (semid < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case SEM_INFO: return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; if (copy_semid_to_user(p, &semid64, version)) err = -EFAULT; return err; case GETALL: case GETVAL: case GETPID: case GETNCNT: case GETZCNT: case SETALL: return semctl_main(ns, semid, semnum, cmd, p); case SETVAL: { int val; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) /* big-endian 64bit */ val = arg >> 32; #else /* 32bit or little-endian 64bit */ val = arg; #endif return semctl_setval(ns, semid, semnum, val); } case IPC_SET: if (copy_semid_from_user(&semid64, p, version)) return -EFAULT; fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: return -EINVAL; } } SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { return ksys_semctl(semid, semnum, cmd, arg, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg) { int version = ipc_parse_version(&cmd); return ksys_semctl(semid, semnum, cmd, arg, version); } SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { return ksys_old_semctl(semid, semnum, cmd, arg); } #endif #ifdef CONFIG_COMPAT struct compat_semid_ds { struct compat_ipc_perm sem_perm; old_time32_t sem_otime; old_time32_t sem_ctime; compat_uptr_t sem_base; compat_uptr_t sem_pending; compat_uptr_t sem_pending_last; compat_uptr_t undo; unsigned short sem_nsems; }; static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_semid64_ds __user *p = buf; return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm); } else { struct compat_semid_ds __user *p = buf; return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm); } } static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in, int version) { if (version == IPC_64) { struct compat_semid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm); v.sem_otime = lower_32_bits(in->sem_otime); v.sem_otime_high = upper_32_bits(in->sem_otime); v.sem_ctime = lower_32_bits(in->sem_ctime); v.sem_ctime_high = upper_32_bits(in->sem_ctime); v.sem_nsems = in->sem_nsems; return 
copy_to_user(buf, &v, sizeof(v)); } else { struct compat_semid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.sem_perm, &in->sem_perm); v.sem_otime = in->sem_otime; v.sem_ctime = in->sem_ctime; v.sem_nsems = in->sem_nsems; return copy_to_user(buf, &v, sizeof(v)); } } static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version) { void __user *p = compat_ptr(arg); struct ipc_namespace *ns; struct semid64_ds semid64; int err; ns = current->nsproxy->ipc_ns; if (semid < 0) return -EINVAL; switch (cmd & (~IPC_64)) { case IPC_INFO: case SEM_INFO: return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; if (copy_compat_semid_to_user(p, &semid64, version)) err = -EFAULT; return err; case GETVAL: case GETPID: case GETNCNT: case GETZCNT: case GETALL: case SETALL: return semctl_main(ns, semid, semnum, cmd, p); case SETVAL: return semctl_setval(ns, semid, semnum, arg); case IPC_SET: if (copy_compat_semid_from_user(&semid64, p, version)) return -EFAULT; fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: return -EINVAL; } } COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg) { return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_semctl(semid, semnum, cmd, arg, version); } COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg) { return compat_ksys_old_semctl(semid, semnum, cmd, arg); } #endif #endif /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, * and current is THE ONE * * If this allocation and assignment succeeds, but later * portions of this code fail, there is no need to free the sem_undo_list. * Just let it stay associated with the task, and it'll be freed later * at exit time. * * This can block, so callers must hold no locks. */ static inline int get_undo_list(struct sem_undo_list **undo_listp) { struct sem_undo_list *undo_list; undo_list = current->sysvsem.undo_list; if (!undo_list) { undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); refcount_set(&undo_list->refcnt, 1); INIT_LIST_HEAD(&undo_list->list_proc); current->sysvsem.undo_list = undo_list; } *undo_listp = undo_list; return 0; } static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } return NULL; } static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; assert_spin_locked(&ulp->lock); un = __lookup_undo(ulp, semid); if (un) { list_del_rcu(&un->list_proc); list_add_rcu(&un->list_proc, &ulp->list_proc); } return un; } /** * find_alloc_undo - lookup (and if not present create) undo array * @ns: namespace * @semid: semaphore array id * * The function looks up (and if not present creates) the undo structure. * The size of the undo structure depends on the size of the semaphore * array, thus the alloc path is not that straightforward. * Lifetime-rules: sem_undo is rcu-protected, on success, the function * performs a rcu_read_lock(). 
*/ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) { struct sem_array *sma; struct sem_undo_list *ulp; struct sem_undo *un, *new; int nsems, error; error = get_undo_list(&ulp); if (error) return ERR_PTR(error); rcu_read_lock(); spin_lock(&ulp->lock); un = lookup_undo(ulp, semid); spin_unlock(&ulp->lock); if (likely(un != NULL)) goto out; /* no undo structure around - allocate one. */ /* step 1: figure out the size of the semaphore array */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return ERR_CAST(sma); } nsems = sma->sem_nsems; if (!ipc_rcu_getref(&sma->sem_perm)) { rcu_read_unlock(); un = ERR_PTR(-EIDRM); goto out; } rcu_read_unlock(); /* step 2: allocate new undo structure */ new = kvzalloc(struct_size(new, semadj, nsems), GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); } /* step 3: Acquire the lock on semaphore array */ rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); kvfree(new); un = ERR_PTR(-EIDRM); goto out; } spin_lock(&ulp->lock); /* * step 4: check for races: did someone else allocate the undo struct? */ un = lookup_undo(ulp, semid); if (un) { spin_unlock(&ulp->lock); kvfree(new); goto success; } /* step 5: initialize & link new undo structure */ new->ulp = ulp; new->semid = semid; assert_spin_locked(&ulp->lock); list_add_rcu(&new->list_proc, &ulp->list_proc); ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; spin_unlock(&ulp->lock); success: sem_unlock(sma, -1); out: return un; } long __do_semtimedop(int semid, struct sembuf *sops, unsigned nsops, const struct timespec64 *timeout, struct ipc_namespace *ns) { int error = -EINVAL; struct sem_array *sma; struct sembuf *sop; struct sem_undo *un; int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; unsigned long dup = 0; ktime_t expires, *exp = NULL; bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; if (nsops > ns->sc_semopm) return -E2BIG; if (timeout) { if (!timespec64_valid(timeout)) return -EINVAL; expires = ktime_add_safe(ktime_get(), timespec64_to_ktime(*timeout)); exp = &expires; } max = 0; for (sop = sops; sop < sops + nsops; sop++) { unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); if (sop->sem_num >= max) max = sop->sem_num; if (sop->sem_flg & SEM_UNDO) undos = true; if (dup & mask) { /* * There was a previous alter access that appears * to have accessed the same semaphore, thus use * the dupsop logic. "appears", because the detection * can only check % BITS_PER_LONG. */ dupsop = true; } if (sop->sem_op != 0) { alter = true; dup |= mask; } } if (undos) { /* On success, find_alloc_undo takes the rcu_read_lock */ un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out; } } else { un = NULL; rcu_read_lock(); } sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); error = PTR_ERR(sma); goto out; } error = -EFBIG; if (max >= sma->sem_nsems) { rcu_read_unlock(); goto out; } error = -EACCES; if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { rcu_read_unlock(); goto out; } error = security_sem_semop(&sma->sem_perm, sops, nsops, alter); if (error) { rcu_read_unlock(); goto out; } error = -EIDRM; locknum = sem_lock(sma, sops, nsops); /* * We eventually might perform the following check in a lockless * fashion, considering ipc_valid_object() locking constraints. 
* If nsops == 1 and there is no contention for sem_perm.lock, then * only a per-semaphore lock is held and it's OK to proceed with the * check below. More details on the fine grained locking scheme * entangled here and why it's RMID race safe on comments at sem_lock() */ if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID * and now a new array with received the same id. Check and fail. * This case can be detected checking un->semid. The existence of * "un" itself is guaranteed by rcu. */ if (un && un->semid == -1) goto out_unlock; queue.sops = sops; queue.nsops = nsops; queue.undo = un; queue.pid = task_tgid(current); queue.alter = alter; queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); if (error == 0) { /* non-blocking successful path */ DEFINE_WAKE_Q(wake_q); /* * If the operation was successful, then do * the required updates. */ if (alter) do_smart_update(sma, sops, nsops, 1, &wake_q); else set_semotime(sma, sops); sem_unlock(sma, locknum); rcu_read_unlock(); wake_up_q(&wake_q); goto out; } if (error < 0) /* non-blocking error path */ goto out_unlock; /* * We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. */ if (nsops == 1) { struct sem *curr; int idx = array_index_nospec(sops->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; if (alter) { if (sma->complex_count) { list_add_tail(&queue.list, &sma->pending_alter); } else { list_add_tail(&queue.list, &curr->pending_alter); } } else { list_add_tail(&queue.list, &curr->pending_const); } } else { if (!sma->complex_count) merge_queues(sma); if (alter) list_add_tail(&queue.list, &sma->pending_alter); else list_add_tail(&queue.list, &sma->pending_const); sma->complex_count++; } do { /* memory ordering ensured by the lock in sem_lock() */ WRITE_ONCE(queue.status, -EINTR); queue.sleeper = current; /* memory ordering is ensured by the lock in sem_lock() */ __set_current_state(TASK_INTERRUPTIBLE); sem_unlock(sma, locknum); rcu_read_unlock(); timed_out = !schedule_hrtimeout_range(exp, current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or * not, from the syscall pov, is quite irrelevant to us at this * point; we're done. * * We _do_ care, nonetheless, about being awoken by a signal or * spuriously. The queue.status is checked again in the * slowpath (aka after taking sem_lock), such that we can detect * scenarios where we were awakened externally, during the * window between wake_q_add() and wake_up_q(). */ rcu_read_lock(); error = READ_ONCE(queue.status); if (error != -EINTR) { /* see SEM_BARRIER_2 for purpose/pairing */ smp_acquire__after_ctrl_dep(); rcu_read_unlock(); goto out; } locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock; /* * No necessity for any barrier: We are protect by sem_lock() */ error = READ_ONCE(queue.status); /* * If queue.status != -EINTR we are woken up by another process. * Leave without unlink_queue(), but with sem_unlock(). */ if (error != -EINTR) goto out_unlock; /* * If an interrupt occurred we have to clean up the queue. 
*/ if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); out_unlock: sem_unlock(sma, locknum); rcu_read_unlock(); out: return error; } static long do_semtimedop(int semid, struct sembuf __user *tsops, unsigned nsops, const struct timespec64 *timeout) { struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf *sops = fast_sops; struct ipc_namespace *ns; int ret; ns = current->nsproxy->ipc_ns; if (nsops > ns->sc_semopm) return -E2BIG; if (nsops < 1) return -EINVAL; if (nsops > SEMOPM_FAST) { sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); if (sops == NULL) return -ENOMEM; } if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { ret = -EFAULT; goto out_free; } ret = __do_semtimedop(semid, sops, nsops, timeout, ns); out_free: if (sops != fast_sops) kvfree(sops); return ret; } long ksys_semtimedop(int semid, struct sembuf __user *tsops, unsigned int nsops, const struct __kernel_timespec __user *timeout) { if (timeout) { struct timespec64 ts; if (get_timespec64(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsops, nsops, &ts); } return do_semtimedop(semid, tsops, nsops, NULL); } SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned int, nsops, const struct __kernel_timespec __user *, timeout) { return ksys_semtimedop(semid, tsops, nsops, timeout); } #ifdef CONFIG_COMPAT_32BIT_TIME long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout) { if (timeout) { struct timespec64 ts; if (get_old_timespec32(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsems, nsops, &ts); } return do_semtimedop(semid, tsems, nsops, NULL); } SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems, unsigned int, nsops, const struct old_timespec32 __user *, timeout) { return compat_ksys_semtimedop(semid, tsems, nsops, timeout); } #endif SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, unsigned, nsops) { return do_semtimedop(semid, tsops, nsops, NULL); } /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between * parent and child tasks. */ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; if (clone_flags & CLONE_SYSVSEM) { error = get_undo_list(&undo_list); if (error) return error; refcount_inc(&undo_list->refcnt); tsk->sysvsem.undo_list = undo_list; } else tsk->sysvsem.undo_list = NULL; return 0; } /* * add semadj values to semaphores, free undo structures. * undo structures are not freed when semaphore arrays are destroyed * so some of them may be out of date. * IMPLEMENTATION NOTE: There is some confusion over whether the * set of adjustments that needs to be done should be done in an atomic * manner or not. That is, if we are attempting to decrement the semval * should we queue up and wait until we can do so legally? * The original implementation attempted to do this (queue and wait). * The current implementation does not do so. The POSIX standard * and SVID should be consulted to determine what behavior is mandated. 
*/ void exit_sem(struct task_struct *tsk) { struct sem_undo_list *ulp; ulp = tsk->sysvsem.undo_list; if (!ulp) return; tsk->sysvsem.undo_list = NULL; if (!refcount_dec_and_test(&ulp->refcnt)) return; for (;;) { struct sem_array *sma; struct sem_undo *un; int semid, i; DEFINE_WAKE_Q(wake_q); cond_resched(); rcu_read_lock(); un = list_entry_rcu(ulp->list_proc.next, struct sem_undo, list_proc); if (&un->list_proc == &ulp->list_proc) { /* * We must wait for freeary() before freeing this ulp, * in case we raced with last sem_undo. There is a small * possibility where we exit while freeary() didn't * finish unlocking sem_undo_list. */ spin_lock(&ulp->lock); spin_unlock(&ulp->lock); rcu_read_unlock(); break; } spin_lock(&ulp->lock); semid = un->semid; spin_unlock(&ulp->lock); /* exit_sem raced with IPC_RMID, nothing to do */ if (semid == -1) { rcu_read_unlock(); continue; } sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid); /* exit_sem raced with IPC_RMID, nothing to do */ if (IS_ERR(sma)) { rcu_read_unlock(); continue; } sem_lock(sma, NULL, -1); /* exit_sem raced with IPC_RMID, nothing to do */ if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); continue; } un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created * exactly the same semid. Nothing to do. */ sem_unlock(sma, -1); rcu_read_unlock(); continue; } /* remove un from the linked lists */ ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); spin_lock(&ulp->lock); list_del_rcu(&un->list_proc); spin_unlock(&ulp->lock); /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { struct sem *semaphore = &sma->sems[i]; if (un->semadj[i]) { semaphore->semval += un->semadj[i]; /* * Range checks of the new semaphore value, * not defined by sus: * - Some unices ignore the undo entirely * (e.g. HP UX 11i 11.22, Tru64 V5.1) * - some cap the value (e.g. FreeBSD caps * at 0, but doesn't enforce SEMVMX) * * Linux caps the semaphore value, both at 0 * and at SEMVMX. * * Manfred <manfred@colorfullife.com> */ if (semaphore->semval < 0) semaphore->semval = 0; if (semaphore->semval > SEMVMX) semaphore->semval = SEMVMX; ipc_update_pid(&semaphore->sempid, task_tgid(current)); } } /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 1, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); kvfree_rcu(un, rcu); } kfree(ulp); } #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); time64_t sem_otime; /* * The proc interface isn't aware of sem_lock(), it calls * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock). * (in sysvipc_find_ipc) * In order to stay compatible with sem_lock(), we must * enter / leave complex_mode. */ complexmode_enter(sma); sem_otime = get_semotime(sma); seq_printf(s, "%10d %10d %4o %10u %5u %5u %5u %5u %10llu %10llu\n", sma->sem_perm.key, sma->sem_perm.id, sma->sem_perm.mode, sma->sem_nsems, from_kuid_munged(user_ns, sma->sem_perm.uid), from_kgid_munged(user_ns, sma->sem_perm.gid), from_kuid_munged(user_ns, sma->sem_perm.cuid), from_kgid_munged(user_ns, sma->sem_perm.cgid), sem_otime, sma->sem_ctime); complexmode_tryleave(sma); return 0; } #endif
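The SEM_UNDO machinery above is easiest to follow from the userspace side. The following is a minimal userspace sketch (an illustration, not part of the kernel sources in this report) that takes a semaphore with semtimedop() and SEM_UNDO: find_alloc_undo() records the per-task adjustment, and exit_sem() replays it, clamped to [0, SEMVMX], if the process exits without restoring the count. It assumes a glibc environment, where semtimedop() needs _GNU_SOURCE and the caller must define union semun for semctl(SETVAL).

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <time.h>

/* glibc requires the caller to define union semun for semctl(SETVAL). */
union semun {
	int val;
	struct semid_ds *buf;
	unsigned short *array;
};

int main(void)
{
	int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	union semun arg = { .val = 1 };
	struct sembuf take = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
	struct timespec timeout = { .tv_sec = 1, .tv_nsec = 0 };

	if (semid < 0 || semctl(semid, 0, SETVAL, arg) < 0) {
		perror("semget/semctl");
		return 1;
	}

	/*
	 * Decrement with SEM_UNDO: the kernel records a +1 semadj for this
	 * task, so an abnormal exit still releases the "token".
	 */
	if (semtimedop(semid, &take, 1, &timeout) < 0)
		perror("semtimedop");

	semctl(semid, 0, IPC_RMID);
	return 0;
}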
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2021 Intel Corporation * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES * * iommufd provides control over the IOMMU HW objects created by IOMMU kernel * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA * addresses (IOVA) to CPU addresses. */ #define pr_fmt(fmt) "iommufd: " fmt #include <linux/bug.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/iommufd.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <uapi/linux/iommufd.h> #include "io_pagetable.h" #include "iommufd_private.h" #include "iommufd_test.h" struct iommufd_object_ops { void (*destroy)(struct iommufd_object *obj); void (*abort)(struct iommufd_object *obj); }; static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; /* * Allow concurrent access to the object. * * Once another thread can see the object pointer it can prevent object * destruction. Except for special kernel-only objects there is no in-kernel way * to reliably destroy a single object. Thus all APIs that are creating objects * must use iommufd_object_abort() to handle their errors and only call * iommufd_object_finalize() once object creation cannot fail.
*/ void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj) { XA_STATE(xas, &ictx->objects, obj->id); void *old; xa_lock(&ictx->objects); old = xas_store(&xas, obj); xa_unlock(&ictx->objects); /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ WARN_ON(old != XA_ZERO_ENTRY); } /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) { XA_STATE(xas, &ictx->objects, obj->id); void *old; xa_lock(&ictx->objects); old = xas_store(&xas, NULL); xa_unlock(&ictx->objects); WARN_ON(old != XA_ZERO_ENTRY); kfree(obj); } /* * Abort an object that has been fully initialized and needs destroy, but has * not been finalized. */ void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { if (iommufd_object_ops[obj->type].abort) iommufd_object_ops[obj->type].abort(obj); else iommufd_object_ops[obj->type].destroy(obj); iommufd_object_abort(ictx, obj); } struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type) { struct iommufd_object *obj; if (iommufd_should_fail()) return ERR_PTR(-ENOENT); xa_lock(&ictx->objects); obj = xa_load(&ictx->objects, id); if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || !iommufd_lock_obj(obj)) obj = ERR_PTR(-ENOENT); xa_unlock(&ictx->objects); return obj; } static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, struct iommufd_object *to_destroy) { if (refcount_dec_and_test(&to_destroy->shortterm_users)) return 0; if (wait_event_timeout(ictx->destroy_wait, refcount_read(&to_destroy->shortterm_users) == 0, msecs_to_jiffies(60000))) return 0; pr_crit("Time out waiting for iommufd object to become free\n"); refcount_inc(&to_destroy->shortterm_users); return -EBUSY; } /* * Remove the given object id from the xarray if the only reference to the * object is held by the xarray. */ int iommufd_object_remove(struct iommufd_ctx *ictx, struct iommufd_object *to_destroy, u32 id, unsigned int flags) { struct iommufd_object *obj; XA_STATE(xas, &ictx->objects, id); bool zerod_shortterm = false; int ret; /* * The purpose of the shortterm_users is to ensure deterministic * destruction of objects used by external drivers and destroyed by this * function. Any temporary increment of the refcount must increment * shortterm_users, such as during ioctl execution. */ if (flags & REMOVE_WAIT_SHORTTERM) { ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); if (ret) { /* * We have a bug. Put back the callers reference and * defer cleaning this object until close. */ refcount_dec(&to_destroy->users); return ret; } zerod_shortterm = true; } xa_lock(&ictx->objects); obj = xas_load(&xas); if (to_destroy) { /* * If the caller is holding a ref on obj we put it here under * the spinlock. */ refcount_dec(&obj->users); if (WARN_ON(obj != to_destroy)) { ret = -ENOENT; goto err_xa; } } else if (xa_is_zero(obj) || !obj) { ret = -ENOENT; goto err_xa; } if (!refcount_dec_if_one(&obj->users)) { ret = -EBUSY; goto err_xa; } xas_store(&xas, NULL); if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) ictx->vfio_ioas = NULL; xa_unlock(&ictx->objects); /* * Since users is zero any positive users_shortterm must be racing * iommufd_put_object(), or we have a bug. 
*/ if (!zerod_shortterm) { ret = iommufd_object_dec_wait_shortterm(ictx, obj); if (WARN_ON(ret)) return ret; } iommufd_object_ops[obj->type].destroy(obj); kfree(obj); return 0; err_xa: if (zerod_shortterm) { /* Restore the xarray owned reference */ refcount_set(&obj->shortterm_users, 1); } xa_unlock(&ictx->objects); /* The returned object reference count is zero */ return ret; } static int iommufd_destroy(struct iommufd_ucmd *ucmd) { struct iommu_destroy *cmd = ucmd->cmd; return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0); } static int iommufd_fops_open(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx; ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); if (!ictx) return -ENOMEM; /* * For compatibility with VFIO when /dev/vfio/vfio is opened we default * to the same rlimit accounting as vfio uses. */ if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && filp->private_data == &vfio_misc_dev) { ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); } init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; init_waitqueue_head(&ictx->destroy_wait); filp->private_data = ictx; return 0; } static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; struct iommufd_object *obj; /* * The objects in the xarray form a graph of "users" counts, and we have * to destroy them in a depth first manner. Leaf objects will reduce the * users count of interior objects when they are destroyed. * * Repeatedly destroying all the "1 users" leaf objects will progress * until the entire list is destroyed. If this can't progress then there * is some bug related to object refcounting. 
*/ while (!xa_empty(&ictx->objects)) { unsigned int destroyed = 0; unsigned long index; xa_for_each(&ictx->objects, index, obj) { if (!refcount_dec_if_one(&obj->users)) continue; destroyed++; xa_erase(&ictx->objects, index); iommufd_object_ops[obj->type].destroy(obj); kfree(obj); } /* Bug related to users refcount */ if (WARN_ON(!destroyed)) break; } WARN_ON(!xa_empty(&ictx->groups)); kfree(ictx); return 0; } static int iommufd_option(struct iommufd_ucmd *ucmd) { struct iommu_option *cmd = ucmd->cmd; int rc; if (cmd->__reserved) return -EOPNOTSUPP; switch (cmd->option_id) { case IOMMU_OPTION_RLIMIT_MODE: rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); break; case IOMMU_OPTION_HUGE_PAGES: rc = iommufd_ioas_option(ucmd); break; default: return -EOPNOTSUPP; } if (rc) return rc; if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, &cmd->val64, sizeof(cmd->val64))) return -EFAULT; return 0; } union ucmd_buffer { struct iommu_destroy destroy; struct iommu_fault_alloc fault; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; struct iommu_ioas_iova_ranges iova_ranges; struct iommu_ioas_map map; struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vdevice_alloc vdev; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif }; struct iommufd_ioctl_op { unsigned int size; unsigned int min_size; unsigned int ioctl_num; int (*execute)(struct iommufd_ucmd *ucmd); }; #define IOCTL_OP(_ioctl, _fn, _struct, _last) \ [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ .size = sizeof(_struct) + \ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ sizeof(_struct)), \ .min_size = offsetofend(_struct, _last), \ .ioctl_num = _ioctl, \ .execute = _fn, \ } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, struct iommu_hwpt_get_dirty_bitmap, data), IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate, struct iommu_hwpt_invalidate, __reserved), IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, struct iommu_ioas_allow_iovas, allowed_iovas), IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, struct iommu_ioas_change_process, __reserved), IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, src_iova), IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, struct iommu_ioas_iova_ranges, out_iova_alignment), IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, struct iommu_ioas_map_file, iova), IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, length), IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, 
val64), IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, struct iommu_vdevice_alloc, virt_id), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, struct iommu_viommu_alloc, out_viommu_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif }; static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct iommufd_ctx *ictx = filp->private_data; const struct iommufd_ioctl_op *op; struct iommufd_ucmd ucmd = {}; union ucmd_buffer buf; unsigned int nr; int ret; nr = _IOC_NR(cmd); if (nr < IOMMUFD_CMD_BASE || (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) return iommufd_vfio_ioctl(ictx, cmd, arg); ucmd.ictx = ictx; ucmd.ubuffer = (void __user *)arg; ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); if (ret) return ret; op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; if (op->ioctl_num != cmd) return -ENOIOCTLCMD; if (ucmd.user_size < op->min_size) return -EINVAL; ucmd.cmd = &buf; ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, ucmd.user_size); if (ret) return ret; ret = op->execute(&ucmd); return ret; } static const struct file_operations iommufd_fops = { .owner = THIS_MODULE, .open = iommufd_fops_open, .release = iommufd_fops_release, .unlocked_ioctl = iommufd_fops_ioctl, }; /** * iommufd_ctx_get - Get a context reference * @ictx: Context to get * * The caller must already hold a valid reference to ictx. */ void iommufd_ctx_get(struct iommufd_ctx *ictx) { get_file(ictx->file); } EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, "IOMMUFD"); /** * iommufd_ctx_from_file - Acquires a reference to the iommufd context * @file: File to obtain the reference from * * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file * remains owned by the caller and the caller must still do fput. On success * the caller is responsible to call iommufd_ctx_put(). */ struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { struct iommufd_ctx *ictx; if (file->f_op != &iommufd_fops) return ERR_PTR(-EBADFD); ictx = file->private_data; iommufd_ctx_get(ictx); return ictx; } EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, "IOMMUFD"); /** * iommufd_ctx_from_fd - Acquires a reference to the iommufd context * @fd: File descriptor to obtain the reference from * * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success * the caller is responsible to call iommufd_ctx_put(). 
*/ struct iommufd_ctx *iommufd_ctx_from_fd(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &iommufd_fops) { fput(file); return ERR_PTR(-EBADFD); } /* fget is the same as iommufd_ctx_get() */ return file->private_data; } EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, "IOMMUFD"); /** * iommufd_ctx_put - Put back a reference * @ictx: Context to put back */ void iommufd_ctx_put(struct iommufd_ctx *ictx) { fput(ictx->file); } EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD"); static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_ACCESS] = { .destroy = iommufd_access_destroy_object, }, [IOMMUFD_OBJ_DEVICE] = { .destroy = iommufd_device_destroy, }, [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, .abort = iommufd_hwpt_paging_abort, }, [IOMMUFD_OBJ_HWPT_NESTED] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, [IOMMUFD_OBJ_VDEVICE] = { .destroy = iommufd_vdevice_destroy, }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, }, #endif }; static struct miscdevice iommu_misc_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "iommu", .fops = &iommufd_fops, .nodename = "iommu", .mode = 0660, }; static struct miscdevice vfio_misc_dev = { .minor = VFIO_MINOR, .name = "vfio", .fops = &iommufd_fops, .nodename = "vfio/vfio", .mode = 0666, }; static int __init iommufd_init(void) { int ret; ret = misc_register(&iommu_misc_dev); if (ret) return ret; if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { ret = misc_register(&vfio_misc_dev); if (ret) goto err_misc; } ret = iommufd_test_init(); if (ret) goto err_vfio_misc; return 0; err_vfio_misc: if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) misc_deregister(&vfio_misc_dev); err_misc: misc_deregister(&iommu_misc_dev); return ret; } static void __exit iommufd_exit(void) { iommufd_test_exit(); if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) misc_deregister(&vfio_misc_dev); misc_deregister(&iommu_misc_dev); } module_init(iommufd_init); module_exit(iommufd_exit); #if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); MODULE_IMPORT_NS("IOMMUFD"); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL");
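For orientation, here is a hypothetical userspace sketch of the convention that iommufd_fops_ioctl() and the IOCTL_OP table above enforce: each command struct leads with its own size, the kernel rejects anything shorter than that command's min_size, and copy_struct_from_user() zero-fills the tail so older binaries keep working against newer kernels. The field names (size, out_ioas_id, id) are assumed from the uapi header <linux/iommufd.h>; treat this as a sketch rather than a reference client.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/iommufd.h>

int main(void)
{
	int fd = open("/dev/iommu", O_RDWR);
	if (fd < 0) {
		perror("open /dev/iommu");
		return 1;
	}

	/* size-first struct: the kernel checks it against min_size (offsetofend(.., out_ioas_id)). */
	struct iommu_ioas_alloc alloc = { .size = sizeof(alloc) };
	if (ioctl(fd, IOMMU_IOAS_ALLOC, &alloc)) {
		perror("IOMMU_IOAS_ALLOC");
		close(fd);
		return 1;
	}
	printf("IOAS id %u\n", alloc.out_ioas_id);

	/*
	 * Explicit teardown goes through iommufd_destroy()/iommufd_object_remove();
	 * anything left over is reaped depth-first in iommufd_fops_release().
	 */
	struct iommu_destroy destroy = { .size = sizeof(destroy), .id = alloc.out_ioas_id };
	if (ioctl(fd, IOMMU_DESTROY, &destroy))
		perror("IOMMU_DESTROY");

	close(fd);
	return 0;
}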
// SPDX-License-Identifier: GPL-2.0-only /* * xt_conntrack - Netfilter module to match connection tracking * information. (Superset of Rusty's minimalistic state match.) * * (C) 2001 Marc Boucher (marc@mbsi.ca). * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_conntrack.h> #include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: connection tracking state match"); MODULE_ALIAS("ipt_conntrack"); MODULE_ALIAS("ip6t_conntrack"); static bool conntrack_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) { if (l3proto == NFPROTO_IPV4) return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; else if (l3proto == NFPROTO_IPV6) return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6) == 0; else return false; } static inline bool conntrack_mt_origsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &info->origsrc_addr, &info->origsrc_mask, family); } static inline bool conntrack_mt_origdst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, &info->origdst_addr, &info->origdst_mask, family); } static inline bool conntrack_mt_replsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, &info->replsrc_addr, &info->replsrc_mask, family); } static inline bool conntrack_mt_repldst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, &info->repldst_addr, &info->repldst_mask, family); } static inline bool ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, const struct nf_conn *ct) { const struct nf_conntrack_tuple *tuple; tuple = 
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; if ((info->match_flags & XT_CONNTRACK_PROTO) && (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO)) return false; /* Shortcut to match all recognized protocols by using ->src.all. */ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && (tuple->src.u.all == info->origsrc_port) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && (tuple->dst.u.all == info->origdst_port) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) return false; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && (tuple->src.u.all == info->replsrc_port) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && (tuple->dst.u.all == info->repldst_port) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) return false; return true; } static inline bool port_match(u16 min, u16 max, u16 port, bool invert) { return (port >= min && port <= max) ^ invert; } static inline bool ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, const struct nf_conn *ct) { const struct nf_conntrack_tuple *tuple; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; if ((info->match_flags & XT_CONNTRACK_PROTO) && (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO)) return false; /* Shortcut to match all recognized protocols by using ->src.all. */ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && !port_match(info->origsrc_port, info->origsrc_port_high, ntohs(tuple->src.u.all), info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && !port_match(info->origdst_port, info->origdst_port_high, ntohs(tuple->dst.u.all), info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) return false; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && !port_match(info->replsrc_port, info->replsrc_port_high, ntohs(tuple->src.u.all), info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && !port_match(info->repldst_port, info->repldst_port_high, ntohs(tuple->dst.u.all), info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) return false; return true; } static bool conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 state_mask, u16 status_mask) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int statebit; ct = nf_ct_get(skb, &ctinfo); if (ct) statebit = XT_CONNTRACK_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) statebit = XT_CONNTRACK_STATE_UNTRACKED; else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { if (ct != NULL) { if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_SNAT; if (test_bit(IPS_DST_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_DNAT; } if (!!(state_mask & statebit) ^ !(info->invert_flags & XT_CONNTRACK_STATE)) return false; } if (ct == NULL) return info->match_flags & XT_CONNTRACK_STATE; if ((info->match_flags & XT_CONNTRACK_DIRECTION) && (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ !(info->invert_flags & XT_CONNTRACK_DIRECTION)) return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) return false; if (info->match_flags & 
XT_CONNTRACK_ORIGDST) if (conntrack_mt_origdst(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST)) return false; if (info->match_flags & XT_CONNTRACK_REPLSRC) if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC)) return false; if (info->match_flags & XT_CONNTRACK_REPLDST) if (conntrack_mt_repldst(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; if (par->match->revision != 3) { if (!ct_proto_port_check(info, ct)) return false; } else { if (!ct_proto_port_check_v3(par->matchinfo, ct)) return false; } if ((info->match_flags & XT_CONNTRACK_STATUS) && (!!(status_mask & ct->status) ^ !(info->invert_flags & XT_CONNTRACK_STATUS))) return false; if (info->match_flags & XT_CONNTRACK_EXPIRES) { unsigned long expires = nf_ct_expires(ct) / HZ; if ((expires >= info->expires_min && expires <= info->expires_max) ^ !(info->invert_flags & XT_CONNTRACK_EXPIRES)) return false; } return true; } static bool conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo1 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo3 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt_v1, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { .name = "conntrack", .revision = 2, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo2), .match = conntrack_mt_v2, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { .name = "conntrack", .revision = 3, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo3), .match = conntrack_mt_v3, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, }; static int __init conntrack_mt_init(void) { return xt_register_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } static void __exit conntrack_mt_exit(void) { xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } module_init(conntrack_mt_init); module_exit(conntrack_mt_exit);
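The match helpers above repeat one idiom: a comparison XORed with the negated invert flag, as in (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO). Because ^ binds tighter than &&, each test reduces to "attribute selected AND (comparison XOR not-inverted)". The short self-contained program below (illustration only, not kernel code) prints the resulting truth table: a packet fails a tested attribute exactly when the raw comparison disagrees with what the possibly inverted rule asks for.

#include <stdbool.h>
#include <stdio.h>

/*
 * rule_fails() mirrors the kernel test
 *     if ((flags & ATTR) && (cmp) ^ !(invert & ATTR)) return false;
 * for the case where the attribute is actually selected (flags & ATTR != 0).
 */
static bool rule_fails(bool cmp_matched, bool rule_inverted)
{
	return cmp_matched ^ !rule_inverted;
}

int main(void)
{
	printf("%d\n", rule_fails(true, false));  /* matched, plain rule    -> passes (0) */
	printf("%d\n", rule_fails(false, false)); /* no match, plain rule   -> fails  (1) */
	printf("%d\n", rule_fails(true, true));   /* matched, inverted rule -> fails  (1) */
	printf("%d\n", rule_fails(false, true));  /* no match, inverted     -> passes (0) */
	return 0;
}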
// SPDX-License-Identifier: GPL-2.0-only /* * vDPA bus. 
* * Copyright (c) 2020, Red Hat. All rights reserved. * Author: Jason Wang <jasowang@redhat.com> * */ #include <linux/module.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/vdpa.h> #include <uapi/linux/vdpa.h> #include <net/genetlink.h> #include <linux/mod_devicetable.h> #include <linux/virtio_ids.h> static LIST_HEAD(mdev_head); /* A global mutex that protects vdpa management device and device level operations. */ static DECLARE_RWSEM(vdpa_dev_lock); static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) { down_write(&vdev->cf_lock); vdev->config->set_status(vdev, status); up_write(&vdev->cf_lock); } EXPORT_SYMBOL(vdpa_set_status); static struct genl_family vdpa_nl_family; static int vdpa_dev_probe(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); const struct vdpa_config_ops *ops = vdev->config; u32 max_num, min_num = 1; int ret = 0; d->dma_mask = &d->coherent_dma_mask; ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); if (ret) return ret; max_num = ops->get_vq_num_max(vdev); if (ops->get_vq_num_min) min_num = ops->get_vq_num_min(vdev); if (max_num < min_num) return -EINVAL; if (drv && drv->probe) ret = drv->probe(vdev); return ret; } static void vdpa_dev_remove(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); if (drv && drv->remove) drv->remove(vdev); } static int vdpa_dev_match(struct device *dev, const struct device_driver *drv) { struct vdpa_device *vdev = dev_to_vdpa(dev); /* Check override first, and if set, only use the named driver */ if (vdev->driver_override) return strcmp(vdev->driver_override, drv->name) == 0; /* Currently devices must be supported by all vDPA bus drivers */ return 1; } static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct vdpa_device *vdev = dev_to_vdpa(dev); int ret; ret = driver_set_override(dev, &vdev->driver_override, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct vdpa_device *vdev = dev_to_vdpa(dev); ssize_t len; device_lock(dev); len = sysfs_emit(buf, "%s\n", vdev->driver_override); device_unlock(dev); return len; } static DEVICE_ATTR_RW(driver_override); static struct attribute *vdpa_dev_attrs[] = { &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group vdpa_dev_group = { .attrs = vdpa_dev_attrs, }; __ATTRIBUTE_GROUPS(vdpa_dev); static const struct bus_type vdpa_bus = { .name = "vdpa", .dev_groups = vdpa_dev_groups, .match = vdpa_dev_match, .probe = vdpa_dev_probe, .remove = vdpa_dev_remove, }; static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); const struct vdpa_config_ops *ops = vdev->config; if (ops->free) ops->free(vdev); ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } /** * __vdpa_alloc_device - allocate and initilaize a vDPA device * This allows driver to some prepartion after device is * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. 
* @use_va: indicate whether virtual address must be used by this device * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * * Return: Returns an error when parent/config/dma_dev is not set or fail to get * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; if (!config) goto err; if (!!config->dma_map != !!config->dma_unmap) goto err; /* It should only work for the device that use on-chip IOMMU */ if (use_va && !(config->dma_map || config->set_map)) goto err; err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) goto err; err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; vdev->dev.bus = &vdpa_bus; vdev->dev.parent = parent; vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; vdev->nas = nas; if (name) err = dev_set_name(&vdev->dev, "%s", name); else err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); if (err) goto err_name; init_rwsem(&vdev->cf_lock); device_initialize(&vdev->dev); return vdev; err_name: ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(__vdpa_alloc_device); static int vdpa_name_match(struct device *dev, const void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); return (strcmp(dev_name(&vdev->dev), data) == 0); } static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { struct device *dev; vdev->nvqs = nvqs; lockdep_assert_held(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); if (dev) { put_device(dev); return -EEXIST; } return device_add(&vdev->dev); } /** * _vdpa_register_device - register a vDPA device with vdpa lock held * Caller must have a succeed call of vdpa_alloc_device() before. * Caller must invoke this routine in the management device dev_add() * callback after setting up valid mgmtdev for this vdpa device. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add device to vDPA bus */ int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { if (!vdev->mdev) return -EINVAL; return __vdpa_register_device(vdev, nvqs); } EXPORT_SYMBOL_GPL(_vdpa_register_device); /** * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add to vDPA bus */ int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { int err; down_write(&vdpa_dev_lock); err = __vdpa_register_device(vdev, nvqs); up_write(&vdpa_dev_lock); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); /** * _vdpa_unregister_device - unregister a vDPA device * Caller must invoke this routine as part of management device dev_del() * callback. 
* @vdev: the vdpa device to be unregisted from vDPA bus */ void _vdpa_unregister_device(struct vdpa_device *vdev) { lockdep_assert_held(&vdpa_dev_lock); WARN_ON(!vdev->mdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL_GPL(_vdpa_unregister_device); /** * vdpa_unregister_device - unregister a vDPA device * @vdev: the vdpa device to be unregisted from vDPA bus */ void vdpa_unregister_device(struct vdpa_device *vdev) { down_write(&vdpa_dev_lock); device_unregister(&vdev->dev); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); /** * __vdpa_register_driver - register a vDPA device driver * @drv: the vdpa device driver to be registered * @owner: module owner of the driver * * Return: Returns an err when fail to do the registration */ int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) { drv->driver.bus = &vdpa_bus; drv->driver.owner = owner; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__vdpa_register_driver); /** * vdpa_unregister_driver - unregister a vDPA device driver * @drv: the vdpa device driver to be unregistered */ void vdpa_unregister_driver(struct vdpa_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(vdpa_unregister_driver); /** * vdpa_mgmtdev_register - register a vdpa management device * * @mdev: Pointer to vdpa management device * vdpa_mgmtdev_register() register a vdpa management device which supports * vdpa device management. * Return: Returns 0 on success or failure when required callback ops are not * initialized. */ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) { if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) return -EINVAL; INIT_LIST_HEAD(&mdev->list); down_write(&vdpa_dev_lock); list_add_tail(&mdev->list, &mdev_head); up_write(&vdpa_dev_lock); return 0; } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); static int vdpa_match_remove(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_mgmt_dev *mdev = vdev->mdev; if (mdev == data) mdev->ops->dev_del(mdev, vdev); return 0; } void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { down_write(&vdpa_dev_lock); list_del(&mdev->list); /* Filter out all the entries belong to this management device and delete it. */ bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); static void vdpa_get_config_unlocked(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. * If it does happen we assume a legacy guest. */ if (!vdev->features_valid) vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } /** * vdpa_get_config - Get one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read to * @len: length of the configuration fields in bytes */ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { down_read(&vdev->cf_lock); vdpa_get_config_unlocked(vdev, offset, buf, len); up_read(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_get_config); /** * vdpa_set_config - Set one or more device configuration fields. 
* @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read from * @length: length of the configuration fields in bytes */ void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int length) { down_write(&vdev->cf_lock); vdev->config->set_config(vdev, offset, buf, length); up_write(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_set_config); static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, const char *busname, const char *devname) { /* Bus name is optional for simulated management device, so ignore the * device with bus if bus attribute is provided. */ if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) return false; if (!busname && strcmp(dev_name(mdev->device), devname) == 0) return true; if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && (strcmp(dev_name(mdev->device), devname) == 0)) return true; return false; } static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) { struct vdpa_mgmt_dev *mdev; const char *busname = NULL; const char *devname; if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) return ERR_PTR(-EINVAL); devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); list_for_each_entry(mdev, &mdev_head, list) { if (mgmtdev_handle_match(mdev, busname, devname)) return mdev; } return ERR_PTR(-ENODEV); } static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) { if (mdev->device->bus && nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) return -EMSGSIZE; return 0; } static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, unsigned int *nclasses) { u64 supported_classes = 0; unsigned int n = 0; for (int i = 0; mdev->id_table[i].device; i++) { if (mdev->id_table[i].device > 63) continue; supported_classes |= BIT_ULL(mdev->id_table[i].device); n++; } if (nclasses) *nclasses = n; return supported_classes; } static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); if (err) goto msg_err; if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, vdpa_mgmtdev_get_classes(mdev, NULL), VDPA_ATTR_UNSPEC)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, mdev->max_supported_vqs)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES, mdev->supported_features, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { up_read(&vdpa_dev_lock); NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); err = PTR_ERR(mdev); goto out; } err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); up_read(&vdpa_dev_lock); if (err) goto out; err = genlmsg_reply(msg, info); 
return err; out: nlmsg_free(msg); return err; } static int vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_mgmt_dev *mdev; int start = cb->args[0]; int idx = 0; int err; down_read(&vdpa_dev_lock); list_for_each_entry(mdev, &mdev_head, list) { if (idx < start) { idx++; continue; } err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) goto out; idx++; } out: up_read(&vdpa_dev_lock); cb->args[0] = idx; return msg->len; } #define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) /* * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for * all 64bit features. If the features are extended beyond 64 bits, or new * "holes" are reserved for other type of features than per-device, this * macro would have to be updated. */ #define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_dev_set_config config = {}; struct nlattr **nl_attrs = info->attrs; struct vdpa_mgmt_dev *mdev; unsigned int ncls = 0; const u8 *macaddr; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); memcpy(config.net.mac, macaddr, sizeof(config.net.mac)); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) { config.net.mtu = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) { config.net.max_vq_pairs = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]); if (!config.net.max_vq_pairs) { NL_SET_ERR_MSG_MOD(info->extack, "At least one pair of VQs is required"); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { u64 missing = 0x0ULL; config.device_features = nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) missing |= BIT_ULL(VIRTIO_NET_F_MAC); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) missing |= BIT_ULL(VIRTIO_NET_F_MTU); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && config.net.max_vq_pairs > 1 && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) missing |= BIT_ULL(VIRTIO_NET_F_MQ); if (missing) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Missing features 0x%llx for provided attributes", missing); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); } /* Skip checking capability if user didn't prefer to configure any * device networking attributes. It is likely that user might have used * a device specific method to configure such attributes or using device * default attributes. 
*/ if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; down_write(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); err = PTR_ERR(mdev); goto err; } if ((config.mask & mdev->config_attr_mask) != config.mask) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Some provided attributes are not supported: 0x%llx", config.mask & ~mdev->config_attr_mask); err = -EOPNOTSUPP; goto err; } classes = vdpa_mgmtdev_get_classes(mdev, &ncls); if (config.mask & VDPA_DEV_NET_ATTRS_MASK && !(classes & BIT_ULL(VIRTIO_ID_NET))) { NL_SET_ERR_MSG_MOD(info->extack, "Network class attributes provided on unsupported management device"); err = -EINVAL; goto err; } if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && config.device_features & VIRTIO_DEVICE_F_MASK) { NL_SET_ERR_MSG_MOD(info->extack, "Management device supports multi-class while device features specified are ambiguous"); err = -EINVAL; goto err; } err = mdev->ops->dev_add(mdev, name, &config); err: up_write(&vdpa_dev_lock); return err; } static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct vdpa_device *vdev; struct device *dev; const char *name; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); err = -EINVAL; goto mdev_err; } mdev = vdev->mdev; mdev->ops->dev_del(mdev, vdev); mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u16 max_vq_size; u16 min_vq_size = 1; u32 device_id; u32 vendor_id; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); if (err) goto msg_err; device_id = vdev->config->get_device_id(vdev); vendor_id = vdev->config->get_vendor_id(vdev); max_vq_size = vdev->config->get_vq_num_max(vdev); if (vdev->config->get_vq_num_min) min_vq_size = vdev->config->get_vq_num_min(vdev); err = -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) goto msg_err; genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if 
(!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { err = -EINVAL; goto mdev_err; } err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); err: up_read(&vdpa_dev_lock); nlmsg_free(msg); return err; } struct vdpa_dev_dump_info { struct sk_buff *msg; struct netlink_callback *cb; int start_idx; int idx; }; static int vdpa_dev_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 && (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16); } static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->mtu); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16); } static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0) return 0; return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config->mac), config->mac); } static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->status); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); } static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) return -EMSGSIZE; return vdpa_dev_net_mq_config_fill(msg, features_device, &config); } static int vdpa_dev_blk_capacity_config_fill(struct sk_buff *msg, const struct virtio_blk_config *config) { u64 val_u64; val_u64 = 
__virtio64_to_cpu(true, config->capacity); return nla_put_u64_64bit(msg, VDPA_ATTR_DEV_BLK_CFG_CAPACITY, val_u64, VDPA_ATTR_PAD); } static int vdpa_dev_blk_seg_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SIZE_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->size_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SIZE_MAX, val_u32); } /* fill the block size*/ static int vdpa_dev_blk_block_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_BLK_SIZE)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->blk_size); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, val_u32); } static int vdpa_dev_blk_seg_max_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SEG_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->seg_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, val_u32); } static int vdpa_dev_blk_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_BLK_F_MQ)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->num_queues); return nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, val_u16); } static int vdpa_dev_blk_topology_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 min_io_size; u32 opt_io_size; if ((features & BIT_ULL(VIRTIO_BLK_F_TOPOLOGY)) == 0) return 0; min_io_size = __virtio16_to_cpu(true, config->min_io_size); opt_io_size = __virtio32_to_cpu(true, config->opt_io_size); if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_PHY_BLK_EXP, config->physical_block_exp)) return -EMSGSIZE; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_ALIGN_OFFSET, config->alignment_offset)) return -EMSGSIZE; if (nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_MIN_IO_SIZE, min_io_size)) return -EMSGSIZE; if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_OPT_IO_SIZE, opt_io_size)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_discard_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_DISCARD)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_discard_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_discard_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEG, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->discard_sector_alignment); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_write_zeroes_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_WRITE_ZEROES)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_ro_config_fill(struct sk_buff *msg, u64 features) { u8 ro; ro = ((features & BIT_ULL(VIRTIO_BLK_F_RO)) == 0) ? 
0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_READ_ONLY, ro)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_flush_config_fill(struct sk_buff *msg, u64 features) { u8 flush; flush = ((features & BIT_ULL(VIRTIO_BLK_F_FLUSH)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_FLUSH, flush)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_blk_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_blk_capacity_config_fill(msg, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_block_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_max_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_mq_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_topology_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_discard_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_write_zeroes_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_ro_config_fill(msg, features_device)) return -EMSGSIZE; if (vdpa_dev_blk_flush_config_fill(msg, features_device)) return -EMSGSIZE; return 0; } static int vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u64 features_driver; u8 status = 0; u32 device_id; void *hdr; int err; down_read(&vdev->cf_lock); hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_CONFIG_GET); if (!hdr) { err = -EMSGSIZE; goto out; } if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto msg_err; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto msg_err; } /* only read driver features after the feature negotiation is done */ status = vdev->config->get_status(vdev); if (status & VIRTIO_CONFIG_S_FEATURES_OK) { features_driver = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } } switch (device_id) { case VIRTIO_ID_NET: err = vdpa_dev_net_config_fill(vdev, msg); break; case VIRTIO_ID_BLOCK: err = vdpa_dev_blk_config_fill(vdev, msg); break; default: err = -EOPNOTSUPP; break; } if (err) goto msg_err; up_read(&vdev->cf_lock); genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); out: up_read(&vdev->cf_lock); return err; } static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { struct virtio_net_config config = {}; u64 features; u8 status; int err; status = vdev->config->get_status(vdev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); return -EAGAIN; } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); features = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, VDPA_ATTR_PAD)) return -EMSGSIZE; err = vdpa_dev_net_mq_config_fill(msg, features, &config); if (err) return err; if (nla_put_u32(msg, 
VDPA_ATTR_DEV_QUEUE_INDEX, index)) return -EMSGSIZE; err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); if (err) return err; return 0; } static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { int err; down_read(&vdev->cf_lock); if (!vdev->config->get_vendor_vq_stats) { err = -EOPNOTSUPP; goto out; } err = vdpa_fill_stats_rec(vdev, msg, info, index); out: up_read(&vdev->cf_lock); return err; } static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { u32 device_id; void *hdr; int err; u32 portid = info->snd_portid; u32 seq = info->snd_seq; u32 flags = 0; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_VSTATS_GET); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto undo_msg; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto undo_msg; } switch (device_id) { case VIRTIO_ID_NET: if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); err = -ERANGE; break; } err = vendor_stats_fill(vdev, msg, info, index); break; default: err = -EOPNOTSUPP; break; } genlmsg_end(msg, hdr); return err; undo_msg: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (!err) err = genlmsg_reply(msg, info); mdev_err: put_device(dev); dev_err: up_read(&vdpa_dev_lock); if (err) nlmsg_free(msg); return err; } static int vdpa_dev_net_device_attr_set(struct vdpa_device *vdev, struct genl_info *info) { struct vdpa_dev_set_config set_config = {}; struct vdpa_mgmt_dev *mdev = vdev->mdev; struct nlattr **nl_attrs = info->attrs; const u8 *macaddr; int err = -EOPNOTSUPP; down_write(&vdev->cf_lock); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { set_config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); if (is_valid_ether_addr(macaddr)) { ether_addr_copy(set_config.net.mac, macaddr); if (mdev->ops->dev_set_attr) { err = mdev->ops->dev_set_attr(mdev, vdev, &set_config); } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Operation not supported by the device."); } } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Invalid MAC address"); } } up_write(&vdev->cf_lock); return err; } static int vdpa_nl_cmd_dev_attr_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct device *dev; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, 
vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } classes = vdpa_mgmtdev_get_classes(vdev->mdev, NULL); if (classes & BIT_ULL(VIRTIO_ID_NET)) { err = vdpa_dev_net_device_attr_set(vdev, info); } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "%s device not supported", name); } mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; u32 index; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); dev_err: nlmsg_free(msg); up_read(&vdpa_dev_lock); return err; } static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, }; static const struct genl_ops vdpa_nl_ops[] = { { .cmd = VDPA_CMD_MGMTDEV_GET, .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_NEW, .doit = vdpa_nl_cmd_dev_add_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_DEL, .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_GET, .doit = vdpa_nl_cmd_dev_get_doit, .dumpit = vdpa_nl_cmd_dev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_CONFIG_GET, .doit 
= vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, { .cmd = VDPA_CMD_DEV_VSTATS_GET, .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_ATTR_SET, .doit = vdpa_nl_cmd_dev_attr_set_doit, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family vdpa_nl_family __ro_after_init = { .name = VDPA_GENL_NAME, .version = VDPA_GENL_VERSION, .maxattr = VDPA_ATTR_MAX, .policy = vdpa_nl_policy, .netnsok = false, .module = THIS_MODULE, .ops = vdpa_nl_ops, .n_ops = ARRAY_SIZE(vdpa_nl_ops), .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1, }; static int vdpa_init(void) { int err; err = bus_register(&vdpa_bus); if (err) return err; err = genl_register_family(&vdpa_nl_family); if (err) goto err; return 0; err: bus_unregister(&vdpa_bus); return err; } static void __exit vdpa_exit(void) { genl_unregister_family(&vdpa_nl_family); bus_unregister(&vdpa_bus); ida_destroy(&vdpa_index_ida); } core_initcall(vdpa_init); module_exit(vdpa_exit); MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); MODULE_DESCRIPTION("vDPA bus"); MODULE_LICENSE("GPL v2");
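The netlink handlers above ultimately dispatch into a management device's vdpa_mgmtdev_ops (dev_add/dev_del/dev_set_attr) and report the fields a vendor driver fills in (id_table, supported_features, max_supported_vqs, config_attr_mask). What follows is an editor's minimal sketch, not part of vdpa.c, of how such a driver might register itself with this core. The example_* names and the parent struct device argument are placeholders; vdpa_mgmtdev_register()/vdpa_mgmtdev_unregister() are assumed to be the registration entry points declared in <linux/vdpa.h> (they are defined earlier in this file, outside this excerpt), and a real dev_add() would allocate and register a struct vdpa_device instead of returning an error.

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_ids.h>
#include <linux/vdpa.h>

static struct vdpa_mgmt_dev example_mgmt_dev;

static int example_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
			   const struct vdpa_dev_set_config *config)
{
	/*
	 * A real driver would allocate its vdpa device here, apply the
	 * attributes carried in @config (MAC, MTU, max VQ pairs, features)
	 * and register the new device against @mdev. Stubbed in this sketch.
	 */
	return -EOPNOTSUPP;
}

static void example_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	/* Tear down and unregister the vdpa device created by dev_add(). */
}

static const struct vdpa_mgmtdev_ops example_mgmtdev_ops = {
	.dev_add = example_dev_add,
	.dev_del = example_dev_del,
};

/* Device classes offered; the zero terminator keeps the id_table walk in
 * vdpa_mgmtdev_get_classes() bounded. */
static struct virtio_device_id example_id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

/* Called from the driver's own probe path with its parent struct device. */
static int example_mgmtdev_register(struct device *parent)
{
	example_mgmt_dev.device = parent;
	example_mgmt_dev.ops = &example_mgmtdev_ops;
	example_mgmt_dev.id_table = example_id_table;
	/* Nothing configurable yet; set BIT_ULL(VDPA_ATTR_DEV_NET_CFG_*)
	 * bits here once dev_add() honours those attributes. */
	example_mgmt_dev.config_attr_mask = 0;
	example_mgmt_dev.supported_features = 0;
	example_mgmt_dev.max_supported_vqs = 2;

	return vdpa_mgmtdev_register(&example_mgmt_dev);
}

static void example_mgmtdev_unregister(void)
{
	vdpa_mgmtdev_unregister(&example_mgmt_dev);
}

Once registered this way, the device shows up through VDPA_CMD_MGMTDEV_GET and can be targeted by VDPA_CMD_DEV_NEW/VDPA_CMD_DEV_DEL as handled above.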
/* * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * * DRM framebuffer helper functions * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE.
* * Authors: * Dave Airlie <airlied@linux.ie> * Jesse Barnes <jesse.barnes@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/console.h> #include <linux/pci.h> #include <linux/sysrq.h> #include <linux/vga_switcheroo.h> #include <drm/drm_atomic.h> #include <drm/drm_drv.h> #include <drm/drm_fb_helper.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include "drm_internal.h" #include "drm_crtc_internal.h" static bool drm_fbdev_emulation = true; module_param_named(fbdev_emulation, drm_fbdev_emulation, bool, 0600); MODULE_PARM_DESC(fbdev_emulation, "Enable legacy fbdev emulation [default=true]"); static int drm_fbdev_overalloc = CONFIG_DRM_FBDEV_OVERALLOC; module_param(drm_fbdev_overalloc, int, 0444); MODULE_PARM_DESC(drm_fbdev_overalloc, "Overallocation of the fbdev buffer (%) [default=" __MODULE_STRING(CONFIG_DRM_FBDEV_OVERALLOC) "]"); /* * In order to keep user-space compatibility, we want in certain use-cases * to keep leaking the fbdev physical address to the user-space program * handling the fbdev buffer. * * This is a bad habit, essentially kept to support closed-source OpenGL * drivers that should really be moved into open-source upstream projects * instead of using legacy physical addresses in user space to communicate * with other out-of-tree kernel modules. * * This module_param *should* be removed as soon as possible and be * considered as a broken and legacy behaviour from a modern fbdev device. */ static bool drm_leak_fbdev_smem; #if IS_ENABLED(CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM) module_param_unsafe(drm_leak_fbdev_smem, bool, 0600); MODULE_PARM_DESC(drm_leak_fbdev_smem, "Allow unsafe leaking fbdev physical smem address [default=false]"); #endif static LIST_HEAD(kernel_fb_helper_list); static DEFINE_MUTEX(kernel_fb_helper_lock); /** * DOC: fbdev helpers * * The fb helper functions are useful to provide an fbdev on top of a drm kernel * mode setting driver. They can be used mostly independently from the crtc * helper functions used by many drivers to implement the kernel mode setting * interfaces. Drivers that use one of the shared memory managers, TTM, SHMEM, * DMA, should instead use the corresponding fbdev emulation. * * For suspend/resume consider using drm_mode_config_helper_suspend() and * drm_mode_config_helper_resume() which takes care of fbdev as well. * * All other functions exported by the fb helper library can be used to * implement the fbdev driver interface by the driver. * * It is possible, though perhaps somewhat tricky, to implement race-free * hotplug detection using the fbdev helpers. The drm_fb_helper_prepare() * helper must be called first to initialize the minimum required to make * hotplug detection work. Drivers also need to make sure to properly set up * the &drm_mode_config.funcs member. After calling drm_kms_helper_poll_init() * it is safe to enable interrupts and start processing hotplug events. At the * same time, drivers should initialize all modeset objects such as CRTCs, * encoders and connectors. To finish up the fbdev helper initialization, the * drm_fb_helper_init() function is called. To probe for all attached displays * and set up an initial configuration using the detected hardware, drivers * should call drm_fb_helper_initial_config(). 
* * If &drm_framebuffer_funcs.dirty is set, the * drm_fb_helper_{cfb,sys}_{write,fillrect,copyarea,imageblit} functions will * accumulate changes and schedule &drm_fb_helper.dirty_work to run right * away. This worker then calls the dirty() function ensuring that it will * always run in process context since the fb_*() function could be running in * atomic context. If drm_fb_helper_deferred_io() is used as the deferred_io * callback it will also schedule dirty_work with the damage collected from the * mmap page writes. */ static void drm_fb_helper_restore_lut_atomic(struct drm_crtc *crtc) { uint16_t *r_base, *g_base, *b_base; if (crtc->funcs->gamma_set == NULL) return; r_base = crtc->gamma_store; g_base = r_base + crtc->gamma_size; b_base = g_base + crtc->gamma_size; crtc->funcs->gamma_set(crtc, r_base, g_base, b_base, crtc->gamma_size, NULL); } /** * drm_fb_helper_debug_enter - implementation for &fb_ops.fb_debug_enter * @info: fbdev registered by the helper */ int drm_fb_helper_debug_enter(struct fb_info *info) { struct drm_fb_helper *helper = info->par; const struct drm_crtc_helper_funcs *funcs; struct drm_mode_set *mode_set; list_for_each_entry(helper, &kernel_fb_helper_list, kernel_fb_list) { mutex_lock(&helper->client.modeset_mutex); drm_client_for_each_modeset(mode_set, &helper->client) { if (!mode_set->crtc->enabled) continue; funcs = mode_set->crtc->helper_private; if (funcs->mode_set_base_atomic == NULL) continue; if (drm_drv_uses_atomic_modeset(mode_set->crtc->dev)) continue; funcs->mode_set_base_atomic(mode_set->crtc, mode_set->fb, mode_set->x, mode_set->y, ENTER_ATOMIC_MODE_SET); } mutex_unlock(&helper->client.modeset_mutex); } return 0; } EXPORT_SYMBOL(drm_fb_helper_debug_enter); /** * drm_fb_helper_debug_leave - implementation for &fb_ops.fb_debug_leave * @info: fbdev registered by the helper */ int drm_fb_helper_debug_leave(struct fb_info *info) { struct drm_fb_helper *helper = info->par; struct drm_client_dev *client = &helper->client; struct drm_device *dev = helper->dev; struct drm_crtc *crtc; const struct drm_crtc_helper_funcs *funcs; struct drm_mode_set *mode_set; struct drm_framebuffer *fb; mutex_lock(&client->modeset_mutex); drm_client_for_each_modeset(mode_set, client) { crtc = mode_set->crtc; if (drm_drv_uses_atomic_modeset(crtc->dev)) continue; funcs = crtc->helper_private; fb = crtc->primary->fb; if (!crtc->enabled) continue; if (!fb) { drm_err(dev, "no fb to restore?\n"); continue; } if (funcs->mode_set_base_atomic == NULL) continue; drm_fb_helper_restore_lut_atomic(mode_set->crtc); funcs->mode_set_base_atomic(mode_set->crtc, fb, crtc->x, crtc->y, LEAVE_ATOMIC_MODE_SET); } mutex_unlock(&client->modeset_mutex); return 0; } EXPORT_SYMBOL(drm_fb_helper_debug_leave); static int __drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper, bool force) { bool do_delayed; int ret; if (!drm_fbdev_emulation || !fb_helper) return -ENODEV; if (READ_ONCE(fb_helper->deferred_setup)) return 0; mutex_lock(&fb_helper->lock); if (force) { /* * Yes this is the _locked version which expects the master lock * to be held. But for forced restores we're intentionally * racing here, see drm_fb_helper_set_par(). 
*/ ret = drm_client_modeset_commit_locked(&fb_helper->client); } else { ret = drm_client_modeset_commit(&fb_helper->client); } do_delayed = fb_helper->delayed_hotplug; if (do_delayed) fb_helper->delayed_hotplug = false; mutex_unlock(&fb_helper->lock); if (do_delayed) drm_fb_helper_hotplug_event(fb_helper); return ret; } /** * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration * @fb_helper: driver-allocated fbdev helper, can be NULL * * This helper should be called from fbdev emulation's &drm_client_funcs.restore * callback. It ensures that the user isn't greeted with a black screen when the * userspace compositor releases the display device. * * Returns: * 0 on success, or a negative errno code otherwise. */ int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) { return __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, false); } EXPORT_SYMBOL(drm_fb_helper_restore_fbdev_mode_unlocked); #ifdef CONFIG_MAGIC_SYSRQ /* emergency restore, don't bother with error reporting */ static void drm_fb_helper_restore_work_fn(struct work_struct *ignored) { struct drm_fb_helper *helper; mutex_lock(&kernel_fb_helper_lock); list_for_each_entry(helper, &kernel_fb_helper_list, kernel_fb_list) { struct drm_device *dev = helper->dev; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) continue; mutex_lock(&helper->lock); drm_client_modeset_commit_locked(&helper->client); mutex_unlock(&helper->lock); } mutex_unlock(&kernel_fb_helper_lock); } static DECLARE_WORK(drm_fb_helper_restore_work, drm_fb_helper_restore_work_fn); static void drm_fb_helper_sysrq(u8 dummy1) { schedule_work(&drm_fb_helper_restore_work); } static const struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { .handler = drm_fb_helper_sysrq, .help_msg = "force-fb(v)", .action_msg = "Restore framebuffer console", }; #else static const struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { }; #endif static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode) { struct drm_fb_helper *fb_helper = info->par; mutex_lock(&fb_helper->lock); drm_client_modeset_dpms(&fb_helper->client, dpms_mode); mutex_unlock(&fb_helper->lock); } /** * drm_fb_helper_blank - implementation for &fb_ops.fb_blank * @blank: desired blanking state * @info: fbdev registered by the helper */ int drm_fb_helper_blank(int blank, struct fb_info *info) { if (oops_in_progress) return -EBUSY; switch (blank) { /* Display: On; HSync: On, VSync: On */ case FB_BLANK_UNBLANK: drm_fb_helper_dpms(info, DRM_MODE_DPMS_ON); break; /* Display: Off; HSync: On, VSync: On */ case FB_BLANK_NORMAL: drm_fb_helper_dpms(info, DRM_MODE_DPMS_STANDBY); break; /* Display: Off; HSync: Off, VSync: On */ case FB_BLANK_HSYNC_SUSPEND: drm_fb_helper_dpms(info, DRM_MODE_DPMS_STANDBY); break; /* Display: Off; HSync: On, VSync: Off */ case FB_BLANK_VSYNC_SUSPEND: drm_fb_helper_dpms(info, DRM_MODE_DPMS_SUSPEND); break; /* Display: Off; HSync: Off, VSync: Off */ case FB_BLANK_POWERDOWN: drm_fb_helper_dpms(info, DRM_MODE_DPMS_OFF); break; } return 0; } EXPORT_SYMBOL(drm_fb_helper_blank); static void drm_fb_helper_resume_worker(struct work_struct *work) { struct drm_fb_helper *helper = container_of(work, struct drm_fb_helper, resume_work); console_lock(); fb_set_suspend(helper->info, 0); console_unlock(); } static void drm_fb_helper_fb_dirty(struct drm_fb_helper *helper) { struct drm_device *dev = helper->dev; struct drm_clip_rect *clip = &helper->damage_clip; struct drm_clip_rect clip_copy; unsigned long flags; int ret; if (drm_WARN_ON_ONCE(dev, 
!helper->funcs->fb_dirty)) return; spin_lock_irqsave(&helper->damage_lock, flags); clip_copy = *clip; clip->x1 = clip->y1 = ~0; clip->x2 = clip->y2 = 0; spin_unlock_irqrestore(&helper->damage_lock, flags); ret = helper->funcs->fb_dirty(helper, &clip_copy); if (ret) goto err; return; err: /* * Restore damage clip rectangle on errors. The next run * of the damage worker will perform the update. */ spin_lock_irqsave(&helper->damage_lock, flags); clip->x1 = min_t(u32, clip->x1, clip_copy.x1); clip->y1 = min_t(u32, clip->y1, clip_copy.y1); clip->x2 = max_t(u32, clip->x2, clip_copy.x2); clip->y2 = max_t(u32, clip->y2, clip_copy.y2); spin_unlock_irqrestore(&helper->damage_lock, flags); } static void drm_fb_helper_damage_work(struct work_struct *work) { struct drm_fb_helper *helper = container_of(work, struct drm_fb_helper, damage_work); drm_fb_helper_fb_dirty(helper); } /** * drm_fb_helper_prepare - setup a drm_fb_helper structure * @dev: DRM device * @helper: driver-allocated fbdev helper structure to set up * @preferred_bpp: Preferred bits per pixel for the device. * @funcs: pointer to structure of functions associate with this helper * * Sets up the bare minimum to make the framebuffer helper usable. This is * useful to implement race-free initialization of the polling helpers. */ void drm_fb_helper_prepare(struct drm_device *dev, struct drm_fb_helper *helper, unsigned int preferred_bpp, const struct drm_fb_helper_funcs *funcs) { /* * Pick a preferred bpp of 32 if no value has been given. This * will select XRGB8888 for the framebuffer formats. All drivers * have to support XRGB8888 for backwards compatibility with legacy * userspace, so it's the safe choice here. * * TODO: Replace struct drm_mode_config.preferred_depth and this * bpp value with a preferred format that is given as struct * drm_format_info. Then derive all other values from the * format. */ if (!preferred_bpp) preferred_bpp = 32; INIT_LIST_HEAD(&helper->kernel_fb_list); spin_lock_init(&helper->damage_lock); INIT_WORK(&helper->resume_work, drm_fb_helper_resume_worker); INIT_WORK(&helper->damage_work, drm_fb_helper_damage_work); helper->damage_clip.x1 = helper->damage_clip.y1 = ~0; mutex_init(&helper->lock); helper->funcs = funcs; helper->dev = dev; helper->preferred_bpp = preferred_bpp; } EXPORT_SYMBOL(drm_fb_helper_prepare); /** * drm_fb_helper_unprepare - clean up a drm_fb_helper structure * @fb_helper: driver-allocated fbdev helper structure to set up * * Cleans up the framebuffer helper. Inverse of drm_fb_helper_prepare(). */ void drm_fb_helper_unprepare(struct drm_fb_helper *fb_helper) { mutex_destroy(&fb_helper->lock); } EXPORT_SYMBOL(drm_fb_helper_unprepare); /** * drm_fb_helper_init - initialize a &struct drm_fb_helper * @dev: drm device * @fb_helper: driver-allocated fbdev helper structure to initialize * * This allocates the structures for the fbdev helper with the given limits. * Note that this won't yet touch the hardware (through the driver interfaces) * nor register the fbdev. This is only done in drm_fb_helper_initial_config() * to allow driver writes more control over the exact init sequence. * * Drivers must call drm_fb_helper_prepare() before calling this function. * * RETURNS: * Zero if everything went ok, nonzero otherwise. */ int drm_fb_helper_init(struct drm_device *dev, struct drm_fb_helper *fb_helper) { int ret; /* * If this is not the generic fbdev client, initialize a drm_client * without callbacks so we can use the modesets. 
*/ if (!fb_helper->client.funcs) { ret = drm_client_init(dev, &fb_helper->client, "drm_fb_helper", NULL); if (ret) return ret; } dev->fb_helper = fb_helper; return 0; } EXPORT_SYMBOL(drm_fb_helper_init); /** * drm_fb_helper_alloc_info - allocate fb_info and some of its members * @fb_helper: driver-allocated fbdev helper * * A helper to alloc fb_info and the member cmap. Called by the driver * within the struct &drm_driver.fbdev_probe callback function. Drivers do * not need to release the allocated fb_info structure themselves, this is * automatically done when calling drm_fb_helper_fini(). * * RETURNS: * fb_info pointer if things went okay, pointer containing error code * otherwise */ struct fb_info *drm_fb_helper_alloc_info(struct drm_fb_helper *fb_helper) { struct device *dev = fb_helper->dev->dev; struct fb_info *info; int ret; info = framebuffer_alloc(0, dev); if (!info) return ERR_PTR(-ENOMEM); if (!drm_leak_fbdev_smem) info->flags |= FBINFO_HIDE_SMEM_START; ret = fb_alloc_cmap(&info->cmap, 256, 0); if (ret) goto err_release; fb_helper->info = info; info->skip_vt_switch = true; info->skip_panic = drm_panic_is_enabled(fb_helper->dev); return info; err_release: framebuffer_release(info); return ERR_PTR(ret); } EXPORT_SYMBOL(drm_fb_helper_alloc_info); /** * drm_fb_helper_release_info - release fb_info and its members * @fb_helper: driver-allocated fbdev helper * * A helper to release fb_info and the member cmap. Drivers do not * need to release the allocated fb_info structure themselves, this is * automatically done when calling drm_fb_helper_fini(). */ void drm_fb_helper_release_info(struct drm_fb_helper *fb_helper) { struct fb_info *info = fb_helper->info; if (!info) return; fb_helper->info = NULL; if (info->cmap.len) fb_dealloc_cmap(&info->cmap); framebuffer_release(info); } EXPORT_SYMBOL(drm_fb_helper_release_info); /** * drm_fb_helper_unregister_info - unregister fb_info framebuffer device * @fb_helper: driver-allocated fbdev helper, must not be NULL * * A wrapper around unregister_framebuffer, to release the fb_info * framebuffer device. This must be called before releasing all resources for * @fb_helper by calling drm_fb_helper_fini(). */ void drm_fb_helper_unregister_info(struct drm_fb_helper *fb_helper) { struct fb_info *info = fb_helper->info; struct device *dev = info->device; if (dev_is_pci(dev)) vga_switcheroo_client_fb_set(to_pci_dev(dev), NULL); unregister_framebuffer(fb_helper->info); } EXPORT_SYMBOL(drm_fb_helper_unregister_info); /** * drm_fb_helper_fini - finialize a &struct drm_fb_helper * @fb_helper: driver-allocated fbdev helper, can be NULL * * This cleans up all remaining resources associated with @fb_helper. 
*/ void drm_fb_helper_fini(struct drm_fb_helper *fb_helper) { if (!fb_helper) return; fb_helper->dev->fb_helper = NULL; if (!drm_fbdev_emulation) return; cancel_work_sync(&fb_helper->resume_work); cancel_work_sync(&fb_helper->damage_work); drm_fb_helper_release_info(fb_helper); mutex_lock(&kernel_fb_helper_lock); if (!list_empty(&fb_helper->kernel_fb_list)) { list_del(&fb_helper->kernel_fb_list); if (list_empty(&kernel_fb_helper_list)) unregister_sysrq_key('v', &sysrq_drm_fb_helper_restore_op); } mutex_unlock(&kernel_fb_helper_lock); if (!fb_helper->client.funcs) drm_client_release(&fb_helper->client); } EXPORT_SYMBOL(drm_fb_helper_fini); static void drm_fb_helper_add_damage_clip(struct drm_fb_helper *helper, u32 x, u32 y, u32 width, u32 height) { struct drm_clip_rect *clip = &helper->damage_clip; unsigned long flags; spin_lock_irqsave(&helper->damage_lock, flags); clip->x1 = min_t(u32, clip->x1, x); clip->y1 = min_t(u32, clip->y1, y); clip->x2 = max_t(u32, clip->x2, x + width); clip->y2 = max_t(u32, clip->y2, y + height); spin_unlock_irqrestore(&helper->damage_lock, flags); } static void drm_fb_helper_damage(struct drm_fb_helper *helper, u32 x, u32 y, u32 width, u32 height) { /* * This function may be invoked by panic() to flush the frame * buffer, where all CPUs except the panic CPU are stopped. * During the following schedule_work(), the panic CPU needs * the worker_pool lock, which might be held by a stopped CPU, * causing schedule_work() and panic() to block. Return early on * oops_in_progress to prevent this blocking. */ if (oops_in_progress) return; drm_fb_helper_add_damage_clip(helper, x, y, width, height); schedule_work(&helper->damage_work); } /* * Convert memory region into area of scanlines and pixels per * scanline. The parameters off and len must not reach beyond * the end of the framebuffer. */ static void drm_fb_helper_memory_range_to_clip(struct fb_info *info, off_t off, size_t len, struct drm_rect *clip) { u32 line_length = info->fix.line_length; u32 fb_height = info->var.yres; off_t end = off + len; u32 x1 = 0; u32 y1 = off / line_length; u32 x2 = info->var.xres; u32 y2 = DIV_ROUND_UP(end, line_length); /* Don't allow any of them beyond the bottom bound of display area */ if (y1 > fb_height) y1 = fb_height; if (y2 > fb_height) y2 = fb_height; if ((y2 - y1) == 1) { /* * We've only written to a single scanline. Try to reduce * the number of horizontal pixels that need an update. */ off_t bit_off = (off % line_length) * 8; off_t bit_end = (end % line_length) * 8; x1 = bit_off / info->var.bits_per_pixel; x2 = DIV_ROUND_UP(bit_end, info->var.bits_per_pixel); } drm_rect_init(clip, x1, y1, x2 - x1, y2 - y1); } /* Don't use in new code. */ void drm_fb_helper_damage_range(struct fb_info *info, off_t off, size_t len) { struct drm_fb_helper *fb_helper = info->par; struct drm_rect damage_area; drm_fb_helper_memory_range_to_clip(info, off, len, &damage_area); drm_fb_helper_damage(fb_helper, damage_area.x1, damage_area.y1, drm_rect_width(&damage_area), drm_rect_height(&damage_area)); } EXPORT_SYMBOL(drm_fb_helper_damage_range); /* Don't use in new code. 
*/ void drm_fb_helper_damage_area(struct fb_info *info, u32 x, u32 y, u32 width, u32 height) { struct drm_fb_helper *fb_helper = info->par; drm_fb_helper_damage(fb_helper, x, y, width, height); } EXPORT_SYMBOL(drm_fb_helper_damage_area); #ifdef CONFIG_FB_DEFERRED_IO /** * drm_fb_helper_deferred_io() - fbdev deferred_io callback function * @info: fb_info struct pointer * @pagereflist: list of mmap framebuffer pages that have to be flushed * * This function is used as the &fb_deferred_io.deferred_io * callback function for flushing the fbdev mmap writes. */ void drm_fb_helper_deferred_io(struct fb_info *info, struct list_head *pagereflist) { struct drm_fb_helper *helper = info->par; unsigned long start, end, min_off, max_off, total_size; struct fb_deferred_io_pageref *pageref; struct drm_rect damage_area; min_off = ULONG_MAX; max_off = 0; list_for_each_entry(pageref, pagereflist, list) { start = pageref->offset; end = start + PAGE_SIZE; min_off = min(min_off, start); max_off = max(max_off, end); } /* * As we can only track pages, we might reach beyond the end * of the screen and account for non-existing scanlines. Hence, * keep the covered memory area within the screen buffer. */ if (info->screen_size) total_size = info->screen_size; else total_size = info->fix.smem_len; max_off = min(max_off, total_size); if (min_off < max_off) { drm_fb_helper_memory_range_to_clip(info, min_off, max_off - min_off, &damage_area); drm_fb_helper_damage(helper, damage_area.x1, damage_area.y1, drm_rect_width(&damage_area), drm_rect_height(&damage_area)); } } EXPORT_SYMBOL(drm_fb_helper_deferred_io); #endif /** * drm_fb_helper_set_suspend - wrapper around fb_set_suspend * @fb_helper: driver-allocated fbdev helper, can be NULL * @suspend: whether to suspend or resume * * A wrapper around fb_set_suspend implemented by fbdev core. * Use drm_fb_helper_set_suspend_unlocked() if you don't need to take * the lock yourself */ void drm_fb_helper_set_suspend(struct drm_fb_helper *fb_helper, bool suspend) { if (fb_helper && fb_helper->info) fb_set_suspend(fb_helper->info, suspend); } EXPORT_SYMBOL(drm_fb_helper_set_suspend); /** * drm_fb_helper_set_suspend_unlocked - wrapper around fb_set_suspend that also * takes the console lock * @fb_helper: driver-allocated fbdev helper, can be NULL * @suspend: whether to suspend or resume * * A wrapper around fb_set_suspend() that takes the console lock. If the lock * isn't available on resume, a worker is tasked with waiting for the lock * to become available. The console lock can be pretty contented on resume * due to all the printk activity. * * This function can be called multiple times with the same state since * &fb_info.state is checked to see if fbdev is running or not before locking. * * Use drm_fb_helper_set_suspend() if you need to take the lock yourself. 
*/ void drm_fb_helper_set_suspend_unlocked(struct drm_fb_helper *fb_helper, bool suspend) { if (!fb_helper || !fb_helper->info) return; /* make sure there's no pending/ongoing resume */ flush_work(&fb_helper->resume_work); if (suspend) { if (fb_helper->info->state != FBINFO_STATE_RUNNING) return; console_lock(); } else { if (fb_helper->info->state == FBINFO_STATE_RUNNING) return; if (!console_trylock()) { schedule_work(&fb_helper->resume_work); return; } } fb_set_suspend(fb_helper->info, suspend); console_unlock(); } EXPORT_SYMBOL(drm_fb_helper_set_suspend_unlocked); static int setcmap_pseudo_palette(struct fb_cmap *cmap, struct fb_info *info) { u32 *palette = (u32 *)info->pseudo_palette; int i; if (cmap->start + cmap->len > 16) return -EINVAL; for (i = 0; i < cmap->len; ++i) { u16 red = cmap->red[i]; u16 green = cmap->green[i]; u16 blue = cmap->blue[i]; u32 value; red >>= 16 - info->var.red.length; green >>= 16 - info->var.green.length; blue >>= 16 - info->var.blue.length; value = (red << info->var.red.offset) | (green << info->var.green.offset) | (blue << info->var.blue.offset); if (info->var.transp.length > 0) { u32 mask = (1 << info->var.transp.length) - 1; mask <<= info->var.transp.offset; value |= mask; } palette[cmap->start + i] = value; } return 0; } static int setcmap_legacy(struct fb_cmap *cmap, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_mode_set *modeset; struct drm_crtc *crtc; u16 *r, *g, *b; int ret = 0; drm_modeset_lock_all(fb_helper->dev); drm_client_for_each_modeset(modeset, &fb_helper->client) { crtc = modeset->crtc; if (!crtc->funcs->gamma_set || !crtc->gamma_size) { ret = -EINVAL; goto out; } if (cmap->start + cmap->len > crtc->gamma_size) { ret = -EINVAL; goto out; } r = crtc->gamma_store; g = r + crtc->gamma_size; b = g + crtc->gamma_size; memcpy(r + cmap->start, cmap->red, cmap->len * sizeof(*r)); memcpy(g + cmap->start, cmap->green, cmap->len * sizeof(*g)); memcpy(b + cmap->start, cmap->blue, cmap->len * sizeof(*b)); ret = crtc->funcs->gamma_set(crtc, r, g, b, crtc->gamma_size, NULL); if (ret) goto out; } out: drm_modeset_unlock_all(fb_helper->dev); return ret; } static struct drm_property_blob *setcmap_new_gamma_lut(struct drm_crtc *crtc, struct fb_cmap *cmap) { struct drm_device *dev = crtc->dev; struct drm_property_blob *gamma_lut; struct drm_color_lut *lut; int size = crtc->gamma_size; int i; if (!size || cmap->start + cmap->len > size) return ERR_PTR(-EINVAL); gamma_lut = drm_property_create_blob(dev, sizeof(*lut) * size, NULL); if (IS_ERR(gamma_lut)) return gamma_lut; lut = gamma_lut->data; if (cmap->start || cmap->len != size) { u16 *r = crtc->gamma_store; u16 *g = r + crtc->gamma_size; u16 *b = g + crtc->gamma_size; for (i = 0; i < cmap->start; i++) { lut[i].red = r[i]; lut[i].green = g[i]; lut[i].blue = b[i]; } for (i = cmap->start + cmap->len; i < size; i++) { lut[i].red = r[i]; lut[i].green = g[i]; lut[i].blue = b[i]; } } for (i = 0; i < cmap->len; i++) { lut[cmap->start + i].red = cmap->red[i]; lut[cmap->start + i].green = cmap->green[i]; lut[cmap->start + i].blue = cmap->blue[i]; } return gamma_lut; } static int setcmap_atomic(struct fb_cmap *cmap, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; struct drm_property_blob *gamma_lut = NULL; struct drm_modeset_acquire_ctx ctx; struct drm_crtc_state *crtc_state; struct drm_atomic_state *state; struct drm_mode_set *modeset; struct drm_crtc *crtc; u16 *r, *g, *b; bool replaced; int ret = 0; 
drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_client_for_each_modeset(modeset, &fb_helper->client) { crtc = modeset->crtc; if (!gamma_lut) gamma_lut = setcmap_new_gamma_lut(crtc, cmap); if (IS_ERR(gamma_lut)) { ret = PTR_ERR(gamma_lut); gamma_lut = NULL; goto out_state; } crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); goto out_state; } /* * FIXME: This always uses gamma_lut. Some HW have only * degamma_lut, in which case we should reset gamma_lut and set * degamma_lut. See drm_crtc_legacy_gamma_set(). */ replaced = drm_property_replace_blob(&crtc_state->degamma_lut, NULL); replaced |= drm_property_replace_blob(&crtc_state->ctm, NULL); replaced |= drm_property_replace_blob(&crtc_state->gamma_lut, gamma_lut); crtc_state->color_mgmt_changed |= replaced; } ret = drm_atomic_commit(state); if (ret) goto out_state; drm_client_for_each_modeset(modeset, &fb_helper->client) { crtc = modeset->crtc; r = crtc->gamma_store; g = r + crtc->gamma_size; b = g + crtc->gamma_size; memcpy(r + cmap->start, cmap->red, cmap->len * sizeof(*r)); memcpy(g + cmap->start, cmap->green, cmap->len * sizeof(*g)); memcpy(b + cmap->start, cmap->blue, cmap->len * sizeof(*b)); } out_state: if (ret == -EDEADLK) goto backoff; drm_property_blob_put(gamma_lut); drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } /** * drm_fb_helper_setcmap - implementation for &fb_ops.fb_setcmap * @cmap: cmap to set * @info: fbdev registered by the helper */ int drm_fb_helper_setcmap(struct fb_cmap *cmap, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; int ret; if (oops_in_progress) return -EBUSY; mutex_lock(&fb_helper->lock); if (!drm_master_internal_acquire(dev)) { ret = -EBUSY; goto unlock; } mutex_lock(&fb_helper->client.modeset_mutex); if (info->fix.visual == FB_VISUAL_TRUECOLOR) ret = setcmap_pseudo_palette(cmap, info); else if (drm_drv_uses_atomic_modeset(fb_helper->dev)) ret = setcmap_atomic(cmap, info); else ret = setcmap_legacy(cmap, info); mutex_unlock(&fb_helper->client.modeset_mutex); drm_master_internal_release(dev); unlock: mutex_unlock(&fb_helper->lock); return ret; } EXPORT_SYMBOL(drm_fb_helper_setcmap); /** * drm_fb_helper_ioctl - legacy ioctl implementation * @info: fbdev registered by the helper * @cmd: ioctl command * @arg: ioctl argument * * A helper to implement the standard fbdev ioctl. Only * FBIO_WAITFORVSYNC is implemented for now. */ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; struct drm_crtc *crtc; int ret = 0; mutex_lock(&fb_helper->lock); if (!drm_master_internal_acquire(dev)) { ret = -EBUSY; goto unlock; } switch (cmd) { case FBIO_WAITFORVSYNC: /* * Only consider the first CRTC. * * This ioctl is supposed to take the CRTC number as * an argument, but in fbdev times, what that number * was supposed to be was quite unclear, different * drivers were passing that argument differently * (some by reference, some by value), and most of the * userspace applications were just hardcoding 0 as an * argument. * * The first CRTC should be the integrated panel on * most drivers, so this is the best choice we can * make. 
If we're not smart enough here, one should * just consider switching the userspace to KMS. */ crtc = fb_helper->client.modesets[0].crtc; /* * Only wait for a vblank event if the CRTC is * enabled, otherwise just don't do anything, * not even report an error. */ ret = drm_crtc_vblank_get(crtc); if (!ret) { drm_crtc_wait_one_vblank(crtc); drm_crtc_vblank_put(crtc); } ret = 0; break; default: ret = -ENOTTY; } drm_master_internal_release(dev); unlock: mutex_unlock(&fb_helper->lock); return ret; } EXPORT_SYMBOL(drm_fb_helper_ioctl); static bool drm_fb_pixel_format_equal(const struct fb_var_screeninfo *var_1, const struct fb_var_screeninfo *var_2) { return var_1->bits_per_pixel == var_2->bits_per_pixel && var_1->grayscale == var_2->grayscale && var_1->red.offset == var_2->red.offset && var_1->red.length == var_2->red.length && var_1->red.msb_right == var_2->red.msb_right && var_1->green.offset == var_2->green.offset && var_1->green.length == var_2->green.length && var_1->green.msb_right == var_2->green.msb_right && var_1->blue.offset == var_2->blue.offset && var_1->blue.length == var_2->blue.length && var_1->blue.msb_right == var_2->blue.msb_right && var_1->transp.offset == var_2->transp.offset && var_1->transp.length == var_2->transp.length && var_1->transp.msb_right == var_2->transp.msb_right; } static void drm_fb_helper_fill_pixel_fmt(struct fb_var_screeninfo *var, const struct drm_format_info *format) { u8 depth = format->depth; if (format->is_color_indexed) { var->red.offset = 0; var->green.offset = 0; var->blue.offset = 0; var->red.length = depth; var->green.length = depth; var->blue.length = depth; var->transp.offset = 0; var->transp.length = 0; return; } switch (depth) { case 15: var->red.offset = 10; var->green.offset = 5; var->blue.offset = 0; var->red.length = 5; var->green.length = 5; var->blue.length = 5; var->transp.offset = 15; var->transp.length = 1; break; case 16: var->red.offset = 11; var->green.offset = 5; var->blue.offset = 0; var->red.length = 5; var->green.length = 6; var->blue.length = 5; var->transp.offset = 0; break; case 24: var->red.offset = 16; var->green.offset = 8; var->blue.offset = 0; var->red.length = 8; var->green.length = 8; var->blue.length = 8; var->transp.offset = 0; var->transp.length = 0; break; case 32: var->red.offset = 16; var->green.offset = 8; var->blue.offset = 0; var->red.length = 8; var->green.length = 8; var->blue.length = 8; var->transp.offset = 24; var->transp.length = 8; break; default: break; } } static void __fill_var(struct fb_var_screeninfo *var, struct fb_info *info, struct drm_framebuffer *fb) { int i; var->xres_virtual = fb->width; var->yres_virtual = fb->height; var->accel_flags = 0; var->bits_per_pixel = drm_format_info_bpp(fb->format, 0); var->height = info->var.height; var->width = info->var.width; var->left_margin = var->right_margin = 0; var->upper_margin = var->lower_margin = 0; var->hsync_len = var->vsync_len = 0; var->sync = var->vmode = 0; var->rotate = 0; var->colorspace = 0; for (i = 0; i < 4; i++) var->reserved[i] = 0; } /** * drm_fb_helper_check_var - implementation for &fb_ops.fb_check_var * @var: screeninfo to check * @info: fbdev registered by the helper */ int drm_fb_helper_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_framebuffer *fb = fb_helper->fb; const struct drm_format_info *format = fb->format; struct drm_device *dev = fb_helper->dev; unsigned int bpp; if (in_dbg_master()) return -EINVAL; if (var->pixclock != 0) { drm_dbg_kms(dev, "fbdev
emulation doesn't support changing the pixel clock, value of pixclock is ignored\n"); var->pixclock = 0; } switch (format->format) { case DRM_FORMAT_C1: case DRM_FORMAT_C2: case DRM_FORMAT_C4: /* supported format with sub-byte pixels */ break; default: if ((drm_format_info_block_width(format, 0) > 1) || (drm_format_info_block_height(format, 0) > 1)) return -EINVAL; break; } /* * Changes struct fb_var_screeninfo are currently not pushed back * to KMS, hence fail if different settings are requested. */ bpp = drm_format_info_bpp(format, 0); if (var->bits_per_pixel > bpp || var->xres > fb->width || var->yres > fb->height || var->xres_virtual > fb->width || var->yres_virtual > fb->height) { drm_dbg_kms(dev, "fb requested width/height/bpp can't fit in current fb " "request %dx%d-%d (virtual %dx%d) > %dx%d-%d\n", var->xres, var->yres, var->bits_per_pixel, var->xres_virtual, var->yres_virtual, fb->width, fb->height, bpp); return -EINVAL; } __fill_var(var, info, fb); /* * fb_pan_display() validates this, but fb_set_par() doesn't and just * falls over. Note that __fill_var above adjusts y/res_virtual. */ if (var->yoffset > var->yres_virtual - var->yres || var->xoffset > var->xres_virtual - var->xres) return -EINVAL; /* We neither support grayscale nor FOURCC (also stored in here). */ if (var->grayscale > 0) return -EINVAL; if (var->nonstd) return -EINVAL; /* * Workaround for SDL 1.2, which is known to be setting all pixel format * fields values to zero in some cases. We treat this situation as a * kind of "use some reasonable autodetected values". */ if (!var->red.offset && !var->green.offset && !var->blue.offset && !var->transp.offset && !var->red.length && !var->green.length && !var->blue.length && !var->transp.length && !var->red.msb_right && !var->green.msb_right && !var->blue.msb_right && !var->transp.msb_right) { drm_fb_helper_fill_pixel_fmt(var, format); } /* * drm fbdev emulation doesn't support changing the pixel format at all, * so reject all pixel format changing requests. */ if (!drm_fb_pixel_format_equal(var, &info->var)) { drm_dbg_kms(dev, "fbdev emulation doesn't support changing the pixel format\n"); return -EINVAL; } return 0; } EXPORT_SYMBOL(drm_fb_helper_check_var); /** * drm_fb_helper_set_par - implementation for &fb_ops.fb_set_par * @info: fbdev registered by the helper * * This will let fbcon do the mode init and is called at initialization time by * the fbdev core when registering the driver, and later on through the hotplug * callback. */ int drm_fb_helper_set_par(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct fb_var_screeninfo *var = &info->var; bool force; if (oops_in_progress) return -EBUSY; /* * Normally we want to make sure that a kms master takes precedence over * fbdev, to avoid fbdev flickering and occasionally stealing the * display status. But Xorg first sets the vt back to text mode using * the KDSET IOCTL with KD_TEXT, and only after that drops the master * status when exiting. * * In the past this was caught by drm_fb_helper_lastclose(), but on * modern systems where logind always keeps a drm fd open to orchestrate * the vt switching, this doesn't work. * * To not break the userspace ABI we have this special case here, which * is only used for the above case. Everything else uses the normal * commit function, which ensures that we never steal the display from * an active drm master. 
*/ force = var->activate & FB_ACTIVATE_KD_TEXT; __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, force); return 0; } EXPORT_SYMBOL(drm_fb_helper_set_par); static void pan_set(struct drm_fb_helper *fb_helper, int dx, int dy) { struct drm_mode_set *mode_set; mutex_lock(&fb_helper->client.modeset_mutex); drm_client_for_each_modeset(mode_set, &fb_helper->client) { mode_set->x += dx; mode_set->y += dy; } mutex_unlock(&fb_helper->client.modeset_mutex); } static int pan_display_atomic(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; int ret, dx, dy; dx = var->xoffset - info->var.xoffset; dy = var->yoffset - info->var.yoffset; pan_set(fb_helper, dx, dy); ret = drm_client_modeset_commit_locked(&fb_helper->client); if (!ret) { info->var.xoffset = var->xoffset; info->var.yoffset = var->yoffset; } else pan_set(fb_helper, -dx, -dy); return ret; } static int pan_display_legacy(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_client_dev *client = &fb_helper->client; struct drm_mode_set *modeset; int ret = 0; mutex_lock(&client->modeset_mutex); drm_modeset_lock_all(fb_helper->dev); drm_client_for_each_modeset(modeset, client) { modeset->x = var->xoffset; modeset->y = var->yoffset; if (modeset->num_connectors) { ret = drm_mode_set_config_internal(modeset); if (!ret) { info->var.xoffset = var->xoffset; info->var.yoffset = var->yoffset; } } } drm_modeset_unlock_all(fb_helper->dev); mutex_unlock(&client->modeset_mutex); return ret; } /** * drm_fb_helper_pan_display - implementation for &fb_ops.fb_pan_display * @var: updated screen information * @info: fbdev registered by the helper */ int drm_fb_helper_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct drm_device *dev = fb_helper->dev; int ret; if (oops_in_progress) return -EBUSY; mutex_lock(&fb_helper->lock); if (!drm_master_internal_acquire(dev)) { ret = -EBUSY; goto unlock; } if (drm_drv_uses_atomic_modeset(dev)) ret = pan_display_atomic(var, info); else ret = pan_display_legacy(var, info); drm_master_internal_release(dev); unlock: mutex_unlock(&fb_helper->lock); return ret; } EXPORT_SYMBOL(drm_fb_helper_pan_display); static uint32_t drm_fb_helper_find_format(struct drm_fb_helper *fb_helper, const uint32_t *formats, size_t format_count, unsigned int color_mode) { struct drm_device *dev = fb_helper->dev; uint32_t format; size_t i; format = drm_driver_color_mode_format(dev, color_mode); if (!format) { drm_info(dev, "unsupported color mode of %d\n", color_mode); return DRM_FORMAT_INVALID; } for (i = 0; i < format_count; ++i) { if (formats[i] == format) return format; } drm_warn(dev, "format %p4cc not supported\n", &format); return DRM_FORMAT_INVALID; } static int __drm_fb_helper_find_sizes(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; int crtc_count = 0; struct drm_connector_list_iter conn_iter; struct drm_connector *connector; struct drm_mode_set *mode_set; uint32_t surface_format = DRM_FORMAT_INVALID; const struct drm_format_info *info; memset(sizes, 0, sizeof(*sizes)); sizes->fb_width = (u32)-1; sizes->fb_height = (u32)-1; drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; struct drm_plane *plane = crtc->primary; drm_dbg_kms(dev, "test CRTC %u primary plane\n", drm_crtc_index(crtc)); 
drm_connector_list_iter_begin(fb_helper->dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_cmdline_mode *cmdline_mode = &connector->cmdline_mode; if (!cmdline_mode->bpp_specified) continue; surface_format = drm_fb_helper_find_format(fb_helper, plane->format_types, plane->format_count, cmdline_mode->bpp); if (surface_format != DRM_FORMAT_INVALID) break; /* found supported format */ } drm_connector_list_iter_end(&conn_iter); if (surface_format != DRM_FORMAT_INVALID) break; /* found supported format */ /* try preferred color mode */ surface_format = drm_fb_helper_find_format(fb_helper, plane->format_types, plane->format_count, fb_helper->preferred_bpp); if (surface_format != DRM_FORMAT_INVALID) break; /* found supported format */ } if (surface_format == DRM_FORMAT_INVALID) { /* * If none of the given color modes works, fall back * to XRGB8888. Drivers are expected to provide this * format for compatibility with legacy applications. */ drm_warn(dev, "No compatible format found\n"); surface_format = drm_driver_legacy_fb_format(dev, 32, 24); } info = drm_format_info(surface_format); sizes->surface_bpp = drm_format_info_bpp(info, 0); sizes->surface_depth = info->depth; /* first up get a count of crtcs now in use and new min/maxes width/heights */ crtc_count = 0; drm_client_for_each_modeset(mode_set, client) { struct drm_display_mode *desired_mode; int x, y, j; /* in case of tile group, are we the last tile vert or horiz? * If no tile group you are always the last one both vertically * and horizontally */ bool lastv = true, lasth = true; desired_mode = mode_set->mode; if (!desired_mode) continue; crtc_count++; x = mode_set->x; y = mode_set->y; sizes->surface_width = max_t(u32, desired_mode->hdisplay + x, sizes->surface_width); sizes->surface_height = max_t(u32, desired_mode->vdisplay + y, sizes->surface_height); for (j = 0; j < mode_set->num_connectors; j++) { struct drm_connector *connector = mode_set->connectors[j]; if (connector->has_tile && desired_mode->hdisplay == connector->tile_h_size && desired_mode->vdisplay == connector->tile_v_size) { lasth = (connector->tile_h_loc == (connector->num_h_tile - 1)); lastv = (connector->tile_v_loc == (connector->num_v_tile - 1)); /* cloning to multiple tiles is just crazy-talk, so: */ break; } } if (lasth) sizes->fb_width = min_t(u32, desired_mode->hdisplay + x, sizes->fb_width); if (lastv) sizes->fb_height = min_t(u32, desired_mode->vdisplay + y, sizes->fb_height); } if (crtc_count == 0 || sizes->fb_width == -1 || sizes->fb_height == -1) { drm_info(dev, "Cannot find any crtc or sizes\n"); return -EAGAIN; } return 0; } static int drm_fb_helper_find_sizes(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_mode_config *config = &dev->mode_config; int ret; mutex_lock(&client->modeset_mutex); ret = __drm_fb_helper_find_sizes(fb_helper, sizes); mutex_unlock(&client->modeset_mutex); if (ret) return ret; /* Handle our overallocation */ sizes->surface_height *= drm_fbdev_overalloc; sizes->surface_height /= 100; if (sizes->surface_height > config->max_height) { drm_dbg_kms(dev, "Fbdev over-allocation too large; clamping height to %d\n", config->max_height); sizes->surface_height = config->max_height; } return 0; } /* * Allocates the backing storage and sets up the fbdev info structure through * the ->fbdev_probe callback. 
*/ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_fb_helper_surface_size sizes; struct fb_info *info; int ret; ret = drm_fb_helper_find_sizes(fb_helper, &sizes); if (ret) { /* First time: disable all crtc's.. */ if (!fb_helper->deferred_setup) drm_client_modeset_commit(client); return ret; } /* push down into drivers */ if (dev->driver->fbdev_probe) ret = dev->driver->fbdev_probe(fb_helper, &sizes); else if (fb_helper->funcs) ret = fb_helper->funcs->fb_probe(fb_helper, &sizes); if (ret < 0) return ret; strcpy(fb_helper->fb->comm, "[fbcon]"); info = fb_helper->info; /* Set the fb info for vgaswitcheroo clients. Does nothing otherwise. */ if (dev_is_pci(info->device)) vga_switcheroo_client_fb_set(to_pci_dev(info->device), info); return 0; } static void drm_fb_helper_fill_fix(struct fb_info *info, uint32_t pitch, bool is_color_indexed) { info->fix.type = FB_TYPE_PACKED_PIXELS; info->fix.visual = is_color_indexed ? FB_VISUAL_PSEUDOCOLOR : FB_VISUAL_TRUECOLOR; info->fix.mmio_start = 0; info->fix.mmio_len = 0; info->fix.type_aux = 0; info->fix.xpanstep = 1; /* doing it in hw */ info->fix.ypanstep = 1; /* doing it in hw */ info->fix.ywrapstep = 0; info->fix.accel = FB_ACCEL_NONE; info->fix.line_length = pitch; } static void drm_fb_helper_fill_var(struct fb_info *info, struct drm_fb_helper *fb_helper, uint32_t fb_width, uint32_t fb_height) { struct drm_framebuffer *fb = fb_helper->fb; const struct drm_format_info *format = fb->format; switch (format->format) { case DRM_FORMAT_C1: case DRM_FORMAT_C2: case DRM_FORMAT_C4: /* supported format with sub-byte pixels */ break; default: WARN_ON((drm_format_info_block_width(format, 0) > 1) || (drm_format_info_block_height(format, 0) > 1)); break; } info->pseudo_palette = fb_helper->pseudo_palette; info->var.xoffset = 0; info->var.yoffset = 0; __fill_var(&info->var, info, fb); info->var.activate = FB_ACTIVATE_NOW; drm_fb_helper_fill_pixel_fmt(&info->var, format); info->var.xres = fb_width; info->var.yres = fb_height; } /** * drm_fb_helper_fill_info - initializes fbdev information * @info: fbdev instance to set up * @fb_helper: fb helper instance to use as template * @sizes: describes fbdev size and scanout surface size * * Sets up the variable and fixed fbdev metainformation from the given fb helper * instance and the drm framebuffer allocated in &drm_fb_helper.fb. * * Drivers should call this (or their equivalent setup code) from their * &drm_driver.fbdev_probe callback after having allocated the fbdev * backing storage framebuffer. */ void drm_fb_helper_fill_info(struct fb_info *info, struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_framebuffer *fb = fb_helper->fb; drm_fb_helper_fill_fix(info, fb->pitches[0], fb->format->is_color_indexed); drm_fb_helper_fill_var(info, fb_helper, sizes->fb_width, sizes->fb_height); info->par = fb_helper; /* * The DRM drivers fbdev emulation device name can be confusing if the * driver name also has a "drm" suffix on it. Leading to names such as * "simpledrmdrmfb" in /proc/fb. Unfortunately, it's an uAPI and can't * be changed due user-space tools (e.g: pm-utils) matching against it. */ snprintf(info->fix.id, sizeof(info->fix.id), "%sdrmfb", fb_helper->dev->driver->name); } EXPORT_SYMBOL(drm_fb_helper_fill_info); /* * This is a continuation of drm_setup_crtcs() that sets up anything related * to the framebuffer. 
During initialization, drm_setup_crtcs() is called before * the framebuffer has been allocated (fb_helper->fb and fb_helper->info). * So, any setup that touches those fields needs to be done here instead of in * drm_setup_crtcs(). */ static void drm_setup_crtcs_fb(struct drm_fb_helper *fb_helper) { struct drm_client_dev *client = &fb_helper->client; struct drm_connector_list_iter conn_iter; struct fb_info *info = fb_helper->info; unsigned int rotation, sw_rotations = 0; struct drm_connector *connector; struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_for_each_modeset(modeset, client) { if (!modeset->num_connectors) continue; modeset->fb = fb_helper->fb; if (drm_client_rotation(modeset, &rotation)) /* Rotating in hardware, fbcon should not rotate */ sw_rotations |= DRM_MODE_ROTATE_0; else sw_rotations |= rotation; } mutex_unlock(&client->modeset_mutex); drm_connector_list_iter_begin(fb_helper->dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { /* use first connected connector for the physical dimensions */ if (connector->status == connector_status_connected) { info->var.width = connector->display_info.width_mm; info->var.height = connector->display_info.height_mm; break; } } drm_connector_list_iter_end(&conn_iter); switch (sw_rotations) { case DRM_MODE_ROTATE_0: info->fbcon_rotate_hint = FB_ROTATE_UR; break; case DRM_MODE_ROTATE_90: info->fbcon_rotate_hint = FB_ROTATE_CCW; break; case DRM_MODE_ROTATE_180: info->fbcon_rotate_hint = FB_ROTATE_UD; break; case DRM_MODE_ROTATE_270: info->fbcon_rotate_hint = FB_ROTATE_CW; break; default: /* * Multiple bits are set / multiple rotations requested * fbcon cannot handle separate rotation settings per * output, so fallback to unrotated. */ info->fbcon_rotate_hint = FB_ROTATE_UR; } } /* Note: Drops fb_helper->lock before returning. */ static int __drm_fb_helper_initial_config_and_unlock(struct drm_fb_helper *fb_helper) { struct drm_device *dev = fb_helper->dev; struct fb_info *info; unsigned int width, height; int ret; width = dev->mode_config.max_width; height = dev->mode_config.max_height; drm_client_modeset_probe(&fb_helper->client, width, height); ret = drm_fb_helper_single_fb_probe(fb_helper); if (ret < 0) { if (ret == -EAGAIN) { fb_helper->deferred_setup = true; ret = 0; } mutex_unlock(&fb_helper->lock); return ret; } drm_setup_crtcs_fb(fb_helper); fb_helper->deferred_setup = false; info = fb_helper->info; info->var.pixclock = 0; /* Need to drop locks to avoid recursive deadlock in * register_framebuffer. This is ok because the only thing left to do is * register the fbdev emulation instance in kernel_fb_helper_list. */ mutex_unlock(&fb_helper->lock); ret = register_framebuffer(info); if (ret < 0) return ret; drm_info(dev, "fb%d: %s frame buffer device\n", info->node, info->fix.id); mutex_lock(&kernel_fb_helper_lock); if (list_empty(&kernel_fb_helper_list)) register_sysrq_key('v', &sysrq_drm_fb_helper_restore_op); list_add(&fb_helper->kernel_fb_list, &kernel_fb_helper_list); mutex_unlock(&kernel_fb_helper_lock); return 0; } /** * drm_fb_helper_initial_config - setup a sane initial connector configuration * @fb_helper: fb_helper device struct * * Scans the CRTCs and connectors and tries to put together an initial setup. * At the moment, this is a cloned configuration across all heads with * a new framebuffer object as the backing store. * * Note that this also registers the fbdev and so allows userspace to call into * the driver through the fbdev interfaces. 
* * This function will call down into the &drm_driver.fbdev_probe callback * to let the driver allocate and initialize the fbdev info structure and the * drm framebuffer used to back the fbdev. drm_fb_helper_fill_info() is provided * as a helper to setup simple default values for the fbdev info structure. * * HANG DEBUGGING: * * When you have fbcon support built-in or already loaded, this function will do * a full modeset to setup the fbdev console. Due to locking misdesign in the * VT/fbdev subsystem that entire modeset sequence has to be done while holding * console_lock. Until console_unlock is called no dmesg lines will be sent out * to consoles, not even serial console. This means when your driver crashes, * you will see absolutely nothing else but a system stuck in this function, * with no further output. Any kind of printk() you place within your own driver * or in the drm core modeset code will also never show up. * * Standard debug practice is to run the fbcon setup without taking the * console_lock as a hack, to be able to see backtraces and crashes on the * serial line. This can be done by setting the fb.lockless_register_fb=1 kernel * cmdline option. * * The other option is to just disable fbdev emulation since very likely the * first modeset from userspace will crash in the same way, and is even easier * to debug. This can be done by setting the drm_kms_helper.fbdev_emulation=0 * kernel cmdline option. * * RETURNS: * Zero if everything went ok, nonzero otherwise. */ int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper) { int ret; if (!drm_fbdev_emulation) return 0; mutex_lock(&fb_helper->lock); ret = __drm_fb_helper_initial_config_and_unlock(fb_helper); return ret; } EXPORT_SYMBOL(drm_fb_helper_initial_config); /** * drm_fb_helper_hotplug_event - respond to a hotplug notification by * probing all the outputs attached to the fb * @fb_helper: driver-allocated fbdev helper, can be NULL * * Scan the connectors attached to the fb_helper and try to put together a * setup after notification of a change in output configuration. * * Called at runtime, takes the mode config locks to be able to check/change the * modeset configuration. Must be run from process context (which usually means * either the output polling work or a work item launched from the driver's * hotplug interrupt). * * Note that drivers may call this even before calling * drm_fb_helper_initial_config but only after drm_fb_helper_init. This allows * for a race-free fbcon setup and will make sure that the fbdev emulation will * not miss any hotplug events. * * RETURNS: * 0 on success and a non-zero error code otherwise. */ int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) { int err = 0; if (!drm_fbdev_emulation || !fb_helper) return 0; mutex_lock(&fb_helper->lock); if (fb_helper->deferred_setup) { err = __drm_fb_helper_initial_config_and_unlock(fb_helper); return err; } if (!fb_helper->fb || !drm_master_internal_acquire(fb_helper->dev)) { fb_helper->delayed_hotplug = true; mutex_unlock(&fb_helper->lock); return err; } drm_master_internal_release(fb_helper->dev); drm_dbg_kms(fb_helper->dev, "\n"); drm_client_modeset_probe(&fb_helper->client, fb_helper->fb->width, fb_helper->fb->height); drm_setup_crtcs_fb(fb_helper); mutex_unlock(&fb_helper->lock); drm_fb_helper_set_par(fb_helper->info); return 0; } EXPORT_SYMBOL(drm_fb_helper_hotplug_event); /** * drm_fb_helper_lastclose - DRM driver lastclose helper for fbdev emulation * @dev: DRM device * * This function is obsolete. 
Call drm_fb_helper_restore_fbdev_mode_unlocked() * instead. */ void drm_fb_helper_lastclose(struct drm_device *dev) { drm_fb_helper_restore_fbdev_mode_unlocked(dev->fb_helper); } EXPORT_SYMBOL(drm_fb_helper_lastclose);
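/*
 * Editor's sketch (not part of the original source): a minimal example of how
 * a driver might wire the exported drm_fb_helper_* implementations above into
 * its fbdev &struct fb_ops. The "example_" identifier is hypothetical; only
 * the drm_fb_helper_* callbacks are taken from the code above.
 */
#include <linux/fb.h>
#include <linux/module.h>
#include <drm/drm_fb_helper.h>

static const struct fb_ops example_fb_ops = {
	.owner		= THIS_MODULE,
	.fb_check_var	= drm_fb_helper_check_var,	/* validate mode requests */
	.fb_set_par	= drm_fb_helper_set_par,	/* (re)apply the fbdev mode */
	.fb_setcmap	= drm_fb_helper_setcmap,	/* palette/gamma updates */
	.fb_pan_display	= drm_fb_helper_pan_display,	/* xoffset/yoffset panning */
	.fb_ioctl	= drm_fb_helper_ioctl,		/* FBIO_WAITFORVSYNC */
};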
// SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after * seeing an unknown discard-with-error flavour TLV option if it's an * ICMP error message or not (errors should never be sent in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where the type of the first header can be found. * * It skips all well-known exthdrs, and returns a pointer to the start * of the unparsable area, i.e. the first header with unknown type. * If it is not NULL, *nexthdr is updated by the type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented.
* - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. 
*/ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr);
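/*
 * Editor's sketch (not part of the original source): the usual calling
 * pattern for ipv6_skip_exthdr() -- start right after the fixed IPv6 header
 * and let the helper walk the extension-header chain down to the transport
 * protocol. The function name below is hypothetical.
 */
#include <linux/skbuff.h>
#include <net/ipv6.h>

static int example_ipv6_transport_proto(const struct sk_buff *skb, int *thoff)
{
	int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
	__be16 frag_off;
	int offset;

	offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off);
	if (offset < 0)
		return -1;	/* truncated packet or NEXTHDR_NONE */

	/* Non-first fragments carry no transport header of their own. */
	if (ntohs(frag_off) & ~0x7)
		return -1;

	*thoff = offset;	/* offset of the transport header */
	return nexthdr;		/* e.g. IPPROTO_TCP or IPPROTO_UDP */
}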
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/pagemap.h> static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: /* skip holes */ return 0; case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; case IOMAP_MAPPED: break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; case IOMAP_INLINE: flags |= FIEMAP_EXTENT_DATA_INLINE; break; } if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, iomap->length, flags); } static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) return iomap_length(iter); ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; switch (ret) { case 0: /* success */ return iomap_length(iter); case 1: /* extent array full */ return 0; default: /* error */ return ret; } } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = start, .len = len, .flags = IOMAP_REPORT, }; struct iomap prev = { .type = IOMAP_HOLE, }; int ret; ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } /* inode with no (attribute) mapping will give ENOENT */ if (ret < 0 && ret != -ENOENT) return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); /* legacy ->bmap interface. 0 is the error return (!) */ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)bno << mapping->host->i_blkbits, .len = i_blocksize(mapping->host), .flags = IOMAP_REPORT, }; const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; /* leave iter.processed unset to abort loop */ } if (ret) return 0; return bno; } EXPORT_SYMBOL_GPL(iomap_bmap);
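/*
 * Editor's sketch (not part of the original source): how a filesystem
 * typically exposes the two helpers above. example_iomap_ops stands in for
 * the filesystem's own mapping ops (a hypothetical name); it only needs an
 * ->iomap_begin that can serve IOMAP_REPORT requests.
 */
#include <linux/fs.h>
#include <linux/fiemap.h>
#include <linux/iomap.h>

extern const struct iomap_ops example_iomap_ops;	/* provided by the fs (hypothetical) */

/* ->fiemap inode operation */
static int example_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
			  u64 start, u64 len)
{
	return iomap_fiemap(inode, fi, start, len, &example_iomap_ops);
}

/* legacy ->bmap address_space operation */
static sector_t example_bmap(struct address_space *mapping, sector_t block)
{
	return iomap_bmap(mapping, block, &example_iomap_ops);
}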
// SPDX-License-Identifier: GPL-2.0-only /* * x_tables core - Backend for {ip,ip6,arp}_tables * * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * Based on existing ip_tables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J.
Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/mutex.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_arp/arp_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 #define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) struct xt_template { struct list_head list; /* called when table is needed in the given netns */ int (*table_init)(struct net *net); struct module *me; /* A unique name... */ char name[XT_TABLE_MAXNAMELEN]; }; static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; }; struct compat_delta { unsigned int offset; /* offset in kernel */ int delta; /* delta in 32bit user land */ }; struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct mutex compat_mutex; struct compat_delta *compat_tab; unsigned int number; /* number of slots in compat_tab[] */ unsigned int cur; /* number of used slots in compat_tab[] */ #endif }; static unsigned int xt_pernet_id __read_mostly; static struct xt_af *xt __read_mostly; static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_UNSPEC] = "x", [NFPROTO_IPV4] = "ip", [NFPROTO_ARP] = "arp", [NFPROTO_BRIDGE] = "eb", [NFPROTO_IPV6] = "ip6", }; /* Registration hooks for targets. 
*/ int xt_register_target(struct xt_target *target) { u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_add(&target->list, &xt[af].target); mutex_unlock(&xt[af].mutex); return 0; } EXPORT_SYMBOL(xt_register_target); void xt_unregister_target(struct xt_target *target) { u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_del(&target->list); mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_target); int xt_register_targets(struct xt_target *target, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = xt_register_target(&target[i]); if (err) goto err; } return err; err: if (i > 0) xt_unregister_targets(target, i); return err; } EXPORT_SYMBOL(xt_register_targets); void xt_unregister_targets(struct xt_target *target, unsigned int n) { while (n-- > 0) xt_unregister_target(&target[n]); } EXPORT_SYMBOL(xt_unregister_targets); int xt_register_match(struct xt_match *match) { u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_add(&match->list, &xt[af].match); mutex_unlock(&xt[af].mutex); return 0; } EXPORT_SYMBOL(xt_register_match); void xt_unregister_match(struct xt_match *match) { u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_del(&match->list); mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_match); int xt_register_matches(struct xt_match *match, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = xt_register_match(&match[i]); if (err) goto err; } return err; err: if (i > 0) xt_unregister_matches(match, i); return err; } EXPORT_SYMBOL(xt_register_matches); void xt_unregister_matches(struct xt_match *match, unsigned int n) { while (n-- > 0) xt_unregister_match(&match[n]); } EXPORT_SYMBOL(xt_unregister_matches); /* * These are weird, but module loading must not be done with mutex * held (since they will register), and we have to have a single * function to use. */ /* Find match, grabs ref. Returns ERR_PTR() on error. */ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) { struct xt_match *m; int err = -ENOENT; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision == revision) { if (try_module_get(m->me)) { mutex_unlock(&xt[af].mutex); return m; } } else err = -EPROTOTYPE; /* Found something. */ } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC) /* Try searching again in the family-independent list */ return xt_find_match(NFPROTO_UNSPEC, name, revision); return ERR_PTR(err); } EXPORT_SYMBOL(xt_find_match); struct xt_match * xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) { struct xt_match *match; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); match = xt_find_match(nfproto, name, revision); if (IS_ERR(match)) { request_module("%st_%s", xt_prefix[nfproto], name); match = xt_find_match(nfproto, name, revision); } return match; } EXPORT_SYMBOL_GPL(xt_request_find_match); /* Find target, grabs ref. Returns ERR_PTR() on error. 
*/ static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; int err = -ENOENT; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision == revision) { if (try_module_get(t->me)) { mutex_unlock(&xt[af].mutex); return t; } } else err = -EPROTOTYPE; /* Found something. */ } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC) /* Try searching again in the family-independent list */ return xt_find_target(NFPROTO_UNSPEC, name, revision); return ERR_PTR(err); } struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) return ERR_PTR(-EINVAL); target = xt_find_target(af, name, revision); if (IS_ERR(target)) { request_module("%st_%s", xt_prefix[af], name); target = xt_find_target(af, name, revision); } return target; } EXPORT_SYMBOL_GPL(xt_request_find_target); static int xt_obj_to_user(u16 __user *psize, u16 size, void __user *pname, const char *name, u8 __user *prev, u8 rev) { if (put_user(size, psize)) return -EFAULT; if (copy_to_user(pname, name, strlen(name) + 1)) return -EFAULT; if (put_user(rev, prev)) return -EFAULT; return 0; } #define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \ xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \ U->u.user.name, K->u.kernel.TYPE->name, \ &U->u.user.revision, K->u.kernel.TYPE->revision) int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size) { usersize = usersize ? : size; if (copy_to_user(dst, src, usersize)) return -EFAULT; if (usersize != aligned_size && clear_user(dst + usersize, aligned_size - usersize)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(xt_data_to_user); #define XT_DATA_TO_USER(U, K, TYPE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ K->u.kernel.TYPE->TYPE##size, \ XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) { return XT_OBJ_TO_USER(u, m, match, 0) || XT_DATA_TO_USER(u, m, match); } EXPORT_SYMBOL_GPL(xt_match_to_user); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u) { return XT_OBJ_TO_USER(u, t, target, 0) || XT_DATA_TO_USER(u, t, target); } EXPORT_SYMBOL_GPL(xt_target_to_user); static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_match *m; int have_rev = 0; mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision > *bestp) *bestp = m->revision; if (m->revision == revision) have_rev = 1; } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); return have_rev; } static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_target *t; int have_rev = 0; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision > *bestp) *bestp = t->revision; if (t->revision == revision) have_rev = 1; } } mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); return have_rev; } /* Returns true or false (if no such extension at all) */ int xt_find_revision(u8 af, const char *name, u8 revision, int target, 
int *err) { int have_rev, best = -1; if (target == 1) have_rev = target_revfn(af, name, revision, &best); else have_rev = match_revfn(af, name, revision, &best); /* Nothing at all? Return 0 to try loading module. */ if (best == -1) { *err = -ENOENT; return 0; } *err = best; if (!have_rev) *err = -EPROTONOSUPPORT; return 1; } EXPORT_SYMBOL_GPL(xt_find_revision); static char * textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto) { static const char *const inetbr_names[] = { "PREROUTING", "INPUT", "FORWARD", "OUTPUT", "POSTROUTING", "BROUTING", }; static const char *const arp_names[] = { "INPUT", "FORWARD", "OUTPUT", }; const char *const *names; unsigned int i, max; char *p = buf; bool np = false; int res; names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names; max = (nfproto == NFPROTO_ARP) ? ARRAY_SIZE(arp_names) : ARRAY_SIZE(inetbr_names); *p = '\0'; for (i = 0; i < max; ++i) { if (!(mask & (1 << i))) continue; res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]); if (res > 0) { size -= res; p += res; } np = true; } return buf; } /** * xt_check_proc_name - check that name is suitable for /proc file creation * * @name: file name candidate * @size: length of buffer * * some x_tables modules wish to create a file in /proc. * This function makes sure that the name is suitable for this * purpose, it checks that name is NUL terminated and isn't a 'special' * name, like "..". * * returns negative number on error or 0 if name is useable. */ int xt_check_proc_name(const char *name, unsigned int size) { if (name[0] == '\0') return -EINVAL; if (strnlen(name, size) == size) return -ENAMETOOLONG; if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0 || strchr(name, '/')) return -EINVAL; return 0; } EXPORT_SYMBOL(xt_check_proc_name); int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u16 proto, bool inv_proto) { int ret; if (XT_ALIGN(par->match->matchsize) != size && par->match->matchsize != -1) { /* * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. */ pr_err_ratelimited("%s_tables: %s.%u match: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->match->name, par->match->revision, XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { pr_info_ratelimited("%s_tables: %s match: only valid in %s table, not %s\n", xt_prefix[par->family], par->match->name, par->match->table, par->table); return -EINVAL; } if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { char used[64], allow[64]; pr_info_ratelimited("%s_tables: %s match: used from hooks %s, but only valid from %s\n", xt_prefix[par->family], par->match->name, textify_hooks(used, sizeof(used), par->hook_mask, par->family), textify_hooks(allow, sizeof(allow), par->match->hooks, par->family)); return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { pr_info_ratelimited("%s_tables: %s match: only valid for protocol %u\n", xt_prefix[par->family], par->match->name, par->match->proto); return -EINVAL; } if (par->match->checkentry != NULL) { ret = par->match->checkentry(par); if (ret < 0) return ret; else if (ret > 0) /* Flag up potential errors. 
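 * A positive return from ->checkentry() is not a valid 0-or-negative-errno
 * result; it is treated as a bug in the extension and mapped to -EIO below.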
*/ return -EIO; } return 0; } EXPORT_SYMBOL_GPL(xt_check_match); /** xt_check_entry_match - check that matches end before start of target * * @match: beginning of xt_entry_match * @target: beginning of this rules target (alleged end of matches) * @alignment: alignment requirement of match structures * * Validates that all matches add up to the beginning of the target, * and that each match covers at least the base structure size. * * Return: 0 on success, negative errno on failure. */ static int xt_check_entry_match(const char *match, const char *target, const size_t alignment) { const struct xt_entry_match *pos; int length = target - match; if (length == 0) /* no matches */ return 0; pos = (struct xt_entry_match *)match; do { if ((unsigned long)pos % alignment) return -EINVAL; if (length < (int)sizeof(struct xt_entry_match)) return -EINVAL; if (pos->u.match_size < sizeof(struct xt_entry_match)) return -EINVAL; if (pos->u.match_size > length) return -EINVAL; length -= pos->u.match_size; pos = ((void *)((char *)(pos) + (pos)->u.match_size)); } while (length > 0); return 0; } /** xt_check_table_hooks - check hook entry points are sane * * @info xt_table_info to check * @valid_hooks - hook entry points that we can enter from * * Validates that the hook entry and underflows points are set up. * * Return: 0 on success, negative errno on failure. */ int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks) { const char *err = "unsorted underflow"; unsigned int i, max_uflow, max_entry; bool check_hooks = false; BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow)); max_entry = 0; max_uflow = 0; for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) { if (!(valid_hooks & (1 << i))) continue; if (info->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; if (info->underflow[i] == 0xFFFFFFFF) return -EINVAL; if (check_hooks) { if (max_uflow > info->underflow[i]) goto error; if (max_uflow == info->underflow[i]) { err = "duplicate underflow"; goto error; } if (max_entry > info->hook_entry[i]) { err = "unsorted entry"; goto error; } if (max_entry == info->hook_entry[i]) { err = "duplicate entry"; goto error; } } max_entry = info->hook_entry[i]; max_uflow = info->underflow[i]; check_hooks = true; } return 0; error: pr_err_ratelimited("%s at hook %d\n", err, i); return -EINVAL; } EXPORT_SYMBOL(xt_check_table_hooks); static bool verdict_ok(int verdict) { if (verdict > 0) return true; if (verdict < 0) { int v = -verdict - 1; if (verdict == XT_RETURN) return true; switch (v) { case NF_ACCEPT: return true; case NF_DROP: return true; case NF_QUEUE: return true; default: break; } return false; } return false; } static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, const char *msg, unsigned int msglen) { return usersize == kernsize && strnlen(msg, msglen) < msglen; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); if (WARN_ON(!xp->compat_tab)) return -ENOMEM; if (xp->cur >= xp->number) return -EINVAL; if (xp->cur) delta += xp->compat_tab[xp->cur - 1].delta; xp->compat_tab[xp->cur].offset = offset; xp->compat_tab[xp->cur].delta = delta; xp->cur++; return 0; } EXPORT_SYMBOL_GPL(xt_compat_add_offset); void xt_compat_flush_offsets(u_int8_t af) { WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); if (xt[af].compat_tab) { vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; xt[af].number = 0; xt[af].cur = 0; } } 
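/*
 * Example (illustrative sketch of how the ip(6)tables compat translation
 * code drives the offset table above; locking shown, error handling
 * trimmed, "num_entries", "entry_offset", "delta" and "v" are placeholders):
 *
 *	xt_compat_lock(NFPROTO_IPV4);
 *	ret = xt_compat_init_offsets(NFPROTO_IPV4, num_entries);
 *	...
 *	ret = xt_compat_add_offset(NFPROTO_IPV4, entry_offset, delta);
 *	...
 *	if (v > 0)	- re-base a jump verdict during translation
 *		v += xt_compat_calc_jump(NFPROTO_IPV4, v);
 *	...
 *	xt_compat_flush_offsets(NFPROTO_IPV4);
 *	xt_compat_unlock(NFPROTO_IPV4);
 */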
EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); int xt_compat_calc_jump(u_int8_t af, unsigned int offset) { struct compat_delta *tmp = xt[af].compat_tab; int mid, left = 0, right = xt[af].cur - 1; while (left <= right) { mid = (left + right) >> 1; if (offset > tmp[mid].offset) left = mid + 1; else if (offset < tmp[mid].offset) right = mid - 1; else return mid ? tmp[mid - 1].delta : 0; } return left ? tmp[left - 1].delta : 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); int xt_compat_init_offsets(u8 af, unsigned int number) { size_t mem; WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); if (!number || number > (INT_MAX / sizeof(struct compat_delta))) return -EINVAL; if (WARN_ON(xt[af].compat_tab)) return -EINVAL; mem = sizeof(struct compat_delta) * number; if (mem > XT_MAX_TABLE_SIZE) return -ENOMEM; xt[af].compat_tab = vmalloc(mem); if (!xt[af].compat_tab) return -ENOMEM; xt[af].number = number; xt[af].cur = 0; return 0; } EXPORT_SYMBOL(xt_compat_init_offsets); int xt_compat_match_offset(const struct xt_match *match) { u_int16_t csize = match->compatsize ? : match->matchsize; return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize); } EXPORT_SYMBOL_GPL(xt_compat_match_offset); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size) { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; int off = xt_compat_match_offset(match); u_int16_t msize = cm->u.user.match_size; char name[sizeof(m->u.user.name)]; m = *dstptr; memcpy(m, cm, sizeof(*cm)); if (match->compat_from_user) match->compat_from_user(m->data, cm->data); else memcpy(m->data, cm->data, msize - sizeof(*cm)); msize += off; m->u.user.match_size = msize; strscpy(name, match->name, sizeof(name)); module_put(match->me); strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name)); *size += off; *dstptr += msize; } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); #define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ C_SIZE, \ COMPAT_XT_ALIGN(C_SIZE)) int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size) { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match __user *cm = *dstptr; int off = xt_compat_match_offset(match); u_int16_t msize = m->u.user.match_size - off; if (XT_OBJ_TO_USER(cm, m, match, msize)) return -EFAULT; if (match->compat_to_user) { if (match->compat_to_user((void __user *)cm->data, m->data)) return -EFAULT; } else { if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) return -EFAULT; } *size -= off; *dstptr += msize; return 0; } EXPORT_SYMBOL_GPL(xt_compat_match_to_user); /* non-compat version may have padding after verdict */ struct compat_xt_standard_target { struct compat_xt_entry_target t; compat_uint_t verdict; }; struct compat_xt_error_target { struct compat_xt_entry_target t; char errorname[XT_FUNCTION_MAXNAMELEN]; }; int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) { long size_of_base_struct = elems - (const char *)base; const struct compat_xt_entry_target *t; const char *e = base; if (target_offset < size_of_base_struct) return -EINVAL; if (target_offset + sizeof(*t) > next_offset) return -EINVAL; t = (void *)(e + target_offset); if (t->u.target_size < sizeof(*t)) return -EINVAL; if (target_offset + t->u.target_size > next_offset) return -EINVAL; if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { 
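		/*
		 * Standard target: the rule must end immediately after the
		 * verdict, and the verdict must be one of the values accepted
		 * by verdict_ok() (a jump offset, XT_RETURN or an NF_* verdict).
		 */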
const struct compat_xt_standard_target *st = (const void *)t; if (COMPAT_XT_ALIGN(target_offset + sizeof(*st)) != next_offset) return -EINVAL; if (!verdict_ok(st->verdict)) return -EINVAL; } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { const struct compat_xt_error_target *et = (const void *)t; if (!error_tg_ok(t->u.target_size, sizeof(*et), et->errorname, sizeof(et->errorname))) return -EINVAL; } /* compat_xt_entry match has less strict alignment requirements, * otherwise they are identical. In case of padding differences * we need to add compat version of xt_check_entry_match. */ BUILD_BUG_ON(sizeof(struct compat_xt_entry_match) != sizeof(struct xt_entry_match)); return xt_check_entry_match(elems, base + target_offset, __alignof__(struct compat_xt_entry_match)); } EXPORT_SYMBOL(xt_compat_check_entry_offsets); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ /** * xt_check_entry_offsets - validate arp/ip/ip6t_entry * * @base: pointer to arp/ip/ip6t_entry * @elems: pointer to first xt_entry_match, i.e. ip(6)t_entry->elems * @target_offset: the arp/ip/ip6_t->target_offset * @next_offset: the arp/ip/ip6_t->next_offset * * validates that target_offset and next_offset are sane and that all * match sizes (if any) align with the target offset. * * This function does not validate the targets or matches themselves, it * only tests that all the offsets and sizes are correct, that all * match structures are aligned, and that the last structure ends where * the target structure begins. * * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version. * * The arp/ip/ip6t_entry structure @base must have passed following tests: * - it must point to a valid memory location * - base to base + next_offset must be accessible, i.e. not exceed allocated * length. * * A well-formed entry looks like this: * * ip(6)t_entry match [mtdata] match [mtdata] target [tgdata] ip(6)t_entry * e->elems[]-----' | | * matchsize | | * matchsize | | * | | * target_offset---------------------------------' | * next_offset---------------------------------------------------' * * elems[]: flexible array member at end of ip(6)/arpt_entry struct. * This is where matches (if any) and the target reside. * target_offset: beginning of target. * next_offset: start of the next rule; also: size of this rule. * Since targets have a minimum size, target_offset + minlen <= next_offset. * * Every match stores its size, sum of sizes must not exceed target_offset. * * Return: 0 on success, negative errno on failure. 
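 *
 * Worked example (illustrative; exact sizes depend on the architecture and
 * build): for an IPv4 rule with a single match followed by a standard target,
 *	target_offset == sizeof(struct ipt_entry) + match->u.match_size
 *	next_offset   == XT_ALIGN(target_offset + sizeof(struct xt_standard_target))
 * and xt_check_entry_match() verifies that the match area in between is
 * covered exactly by the declared match size.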
*/ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) { long size_of_base_struct = elems - (const char *)base; const struct xt_entry_target *t; const char *e = base; /* target start is within the ip/ip6/arpt_entry struct */ if (target_offset < size_of_base_struct) return -EINVAL; if (target_offset + sizeof(*t) > next_offset) return -EINVAL; t = (void *)(e + target_offset); if (t->u.target_size < sizeof(*t)) return -EINVAL; if (target_offset + t->u.target_size > next_offset) return -EINVAL; if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { const struct xt_standard_target *st = (const void *)t; if (XT_ALIGN(target_offset + sizeof(*st)) != next_offset) return -EINVAL; if (!verdict_ok(st->verdict)) return -EINVAL; } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { const struct xt_error_target *et = (const void *)t; if (!error_tg_ok(t->u.target_size, sizeof(*et), et->errorname, sizeof(et->errorname))) return -EINVAL; } return xt_check_entry_match(elems, base + target_offset, __alignof__(struct xt_entry_match)); } EXPORT_SYMBOL(xt_check_entry_offsets); /** * xt_alloc_entry_offsets - allocate array to store rule head offsets * * @size: number of entries * * Return: NULL or zeroed kmalloc'd or vmalloc'd array */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int)) return NULL; return kvcalloc(size, sizeof(unsigned int), GFP_KERNEL); } EXPORT_SYMBOL(xt_alloc_entry_offsets); /** * xt_find_jump_offset - check if target is a valid jump offset * * @offsets: array containing all valid rule start offsets of a rule blob * @target: the jump target to search for * @size: entries in @offset */ bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size) { int m, low = 0, hi = size; while (hi > low) { m = (low + hi) / 2u; if (offsets[m] > target) hi = m; else if (offsets[m] < target) low = m + 1; else return true; } return false; } EXPORT_SYMBOL(xt_find_jump_offset); int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u16 proto, bool inv_proto) { int ret; if (XT_ALIGN(par->target->targetsize) != size) { pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->target->name, par->target->revision, XT_ALIGN(par->target->targetsize), size); return -EINVAL; } if (par->target->table != NULL && strcmp(par->target->table, par->table) != 0) { pr_info_ratelimited("%s_tables: %s target: only valid in %s table, not %s\n", xt_prefix[par->family], par->target->name, par->target->table, par->table); return -EINVAL; } if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { char used[64], allow[64]; pr_info_ratelimited("%s_tables: %s target: used from hooks %s, but only usable from %s\n", xt_prefix[par->family], par->target->name, textify_hooks(used, sizeof(used), par->hook_mask, par->family), textify_hooks(allow, sizeof(allow), par->target->hooks, par->family)); return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { pr_info_ratelimited("%s_tables: %s target: only valid for protocol %u\n", xt_prefix[par->family], par->target->name, par->target->proto); return -EINVAL; } if (par->target->checkentry != NULL) { ret = par->target->checkentry(par); if (ret < 0) return ret; else if (ret > 0) /* Flag up potential errors. 
*/ return -EIO; } return 0; } EXPORT_SYMBOL_GPL(xt_check_target); /** * xt_copy_counters - copy counters and metadata from a sockptr_t * * @arg: src sockptr * @len: alleged size of userspace memory * @info: where to store the xt_counters_info metadata * * Copies counter meta data from @user and stores it in @info. * * vmallocs memory to hold the counters, then copies the counter data * from @user to the new memory and returns a pointer to it. * * If called from a compat syscall, @info gets converted automatically to the * 64bit representation. * * The metadata associated with the counters is stored in @info. * * Return: returns pointer that caller has to test via IS_ERR(). * If IS_ERR is false, caller has to vfree the pointer. */ void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info) { size_t offset; void *mem; u64 size; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; if (len <= sizeof(compat_tmp)) return ERR_PTR(-EINVAL); len -= sizeof(compat_tmp); if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; offset = sizeof(compat_tmp); } else #endif { if (len <= sizeof(*info)) return ERR_PTR(-EINVAL); len -= sizeof(*info); if (copy_from_sockptr(info, arg, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); offset = sizeof(*info); } info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; if (size != (u64)len) return ERR_PTR(-EINVAL); mem = vmalloc(len); if (!mem) return ERR_PTR(-ENOMEM); if (copy_from_sockptr_offset(mem, arg, offset, len) == 0) return mem; vfree(mem); return ERR_PTR(-EFAULT); } EXPORT_SYMBOL_GPL(xt_copy_counters); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_target_offset(const struct xt_target *target) { u_int16_t csize = target->compatsize ? 
: target->targetsize; return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize); } EXPORT_SYMBOL_GPL(xt_compat_target_offset); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size) { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; int off = xt_compat_target_offset(target); u_int16_t tsize = ct->u.user.target_size; char name[sizeof(t->u.user.name)]; t = *dstptr; memcpy(t, ct, sizeof(*ct)); if (target->compat_from_user) target->compat_from_user(t->data, ct->data); else unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct), /* UAPI 0-sized destination */); tsize += off; t->u.user.target_size = tsize; strscpy(name, target->name, sizeof(name)); module_put(target->me); strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name)); *size += off; *dstptr += tsize; } EXPORT_SYMBOL_GPL(xt_compat_target_from_user); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size) { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target __user *ct = *dstptr; int off = xt_compat_target_offset(target); u_int16_t tsize = t->u.user.target_size - off; if (XT_OBJ_TO_USER(ct, t, target, tsize)) return -EFAULT; if (target->compat_to_user) { if (target->compat_to_user((void __user *)ct->data, t->data)) return -EFAULT; } else { if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) return -EFAULT; } *size -= off; *dstptr += tsize; return 0; } EXPORT_SYMBOL_GPL(xt_compat_target_to_user); #endif struct xt_table_info *xt_alloc_table_info(unsigned int size) { struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE) return NULL; info = kvmalloc(sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; memset(info, 0, sizeof(*info)); info->size = size; return info; } EXPORT_SYMBOL(xt_alloc_table_info); void xt_free_table_info(struct xt_table_info *info) { int cpu; if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); kvfree(info->jumpstack); } kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) { if (strcmp(t->name, name) == 0) { mutex_unlock(&xt[af].mutex); return t; } } mutex_unlock(&xt[af].mutex); return NULL; } EXPORT_SYMBOL(xt_find_table); /* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. 
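 *
 * Example (illustrative, mirrors how the ip(6)tables sockopt handlers use
 * the lookup; error handling trimmed and "filter" is just a sample table
 * name):
 *
 *	t = xt_request_find_table_lock(net, NFPROTO_IPV4, "filter");
 *	if (IS_ERR(t))
 *		return PTR_ERR(t);
 *	... inspect t->private while the per-family mutex is held ...
 *	xt_table_unlock(t);
 *	module_put(t->me);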
*/ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct module *owner = NULL; struct xt_template *tmpl; struct xt_table *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; /* Table doesn't exist in this netns, check larval list */ list_for_each_entry(tmpl, &xt_templates[af], list) { int err; if (strcmp(tmpl->name, name)) continue; if (!try_module_get(tmpl->me)) goto out; owner = tmpl->me; mutex_unlock(&xt[af].mutex); err = tmpl->table_init(net); if (err < 0) { module_put(owner); return ERR_PTR(err); } mutex_lock(&xt[af].mutex); break; } /* and once again: */ list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && owner == t->me) return t; module_put(owner); out: mutex_unlock(&xt[af].mutex); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(xt_find_table_lock); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name) { struct xt_table *t = xt_find_table_lock(net, af, name); #ifdef CONFIG_MODULES if (IS_ERR(t)) { int err = request_module("%stable_%s", xt_prefix[af], name); if (err < 0) return ERR_PTR(err); t = xt_find_table_lock(net, af, name); } #endif return t; } EXPORT_SYMBOL_GPL(xt_request_find_table_lock); void xt_table_unlock(struct xt_table *table) { mutex_unlock(&xt[table->af].mutex); } EXPORT_SYMBOL_GPL(xt_table_unlock); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_lock); void xt_compat_unlock(u_int8_t af) { mutex_unlock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); struct static_key xt_tee_enabled __read_mostly; EXPORT_SYMBOL_GPL(xt_tee_enabled); static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; int cpu; size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = kvzalloc(size, GFP_KERNEL); else i->jumpstack = kzalloc(size, GFP_KERNEL); if (i->jumpstack == NULL) return -ENOMEM; /* ruleset without jumps -- no stack needed */ if (i->stacksize == 0) return 0; /* Jumpstack needs to be able to record two full callchains, one * from the first rule set traversal, plus one table reentrancy * via -j TEE without clobbering the callchain that brought us to * TEE target. * * This is done by allocating two jumpstacks per cpu, on reentry * the upper half of the stack is used. * * see the jumpstack setup in ipt_do_table() for more details. */ size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (i->jumpstack[cpu] == NULL) /* * Freeing will be done later on by the callers. The * chain is: xt_replace_table -> __do_replace -> * do_replace -> xt_free_table_info. 
*/ return -ENOMEM; } return 0; } struct xt_counters *xt_counters_alloc(unsigned int counters) { struct xt_counters *mem; if (counters == 0 || counters > INT_MAX / sizeof(*mem)) return NULL; counters *= sizeof(*mem); if (counters > XT_MAX_TABLE_SIZE) return NULL; return vzalloc(counters); } EXPORT_SYMBOL(xt_counters_alloc); struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { struct xt_table_info *private; unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); if (ret < 0) { *error = ret; return NULL; } /* Do the substitution. */ local_bh_disable(); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; /* * Ensure contents of newinfo are visible before assigning to * private. */ smp_wmb(); table->private = newinfo; /* make sure all cpus see new ->private value */ smp_mb(); /* * Even though table entries have now been swapped, other CPU's * may still be using the old entries... */ local_bh_enable(); /* ... so wait for even xt_recseq on all cpus */ for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); u32 seq = raw_read_seqcount(s); if (seq & 1) { do { cond_resched(); cpu_relax(); } while (seq == raw_read_seqcount(s)); } } audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : AUDIT_XT_OP_REPLACE, GFP_KERNEL); return private; } EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table_info *private; struct xt_table *t, *table; int ret; /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); if (!table) { ret = -ENOMEM; goto out; } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ list_for_each_entry(t, &xt_net->tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto unlock; } } /* Simplifies replace_table code. 
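 * The table initially points at the caller-supplied bootstrap info, which
 * describes zero entries; that lets xt_replace_table() be reused for the
 * very first swap, since its "num_counters != private->number" check is
 * invoked with 0 and therefore passes.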
*/ table->private = bootstrap; if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ private->initial_entries = private->number; list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); kfree(table); out: return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); void *xt_unregister_table(struct xt_table *table) { struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); private = table->private; list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); kfree(table->ops); kfree(table); return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; xt_net = net_generic(net, xt_pernet_id); mutex_lock(&xt[af].mutex); return seq_list_start(&xt_net->tables[af], *pos); } static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; xt_net = net_generic(net, xt_pernet_id); return seq_list_next(v, &xt_net->tables[af], pos); } static void xt_table_seq_stop(struct seq_file *seq, void *v) { u_int8_t af = (unsigned long)pde_data(file_inode(seq->file)); mutex_unlock(&xt[af].mutex); } static int xt_table_seq_show(struct seq_file *seq, void *v) { struct xt_table *table = list_entry(v, struct xt_table, list); if (*table->name) seq_printf(seq, "%s\n", table->name); return 0; } static const struct seq_operations xt_table_seq_ops = { .start = xt_table_seq_start, .next = xt_table_seq_next, .stop = xt_table_seq_stop, .show = xt_table_seq_show, }; /* * Traverse state for ip{,6}_{tables,matches} for helping crossing * the multi-AF mutexes. */ struct nf_mttg_trav { struct list_head *head, *curr; uint8_t class; }; enum { MTTG_TRAV_INIT, MTTG_TRAV_NFP_UNSPEC, MTTG_TRAV_NFP_SPEC, MTTG_TRAV_DONE, }; static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, bool is_target) { static const uint8_t next_class[] = { [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, }; uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; if (ppos != NULL) ++(*ppos); switch (trav->class) { case MTTG_TRAV_INIT: trav->class = MTTG_TRAV_NFP_UNSPEC; mutex_lock(&xt[NFPROTO_UNSPEC].mutex); trav->head = trav->curr = is_target ? &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match; break; case MTTG_TRAV_NFP_UNSPEC: trav->curr = trav->curr->next; if (trav->curr != trav->head) break; mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); mutex_lock(&xt[nfproto].mutex); trav->head = trav->curr = is_target ? 
&xt[nfproto].target : &xt[nfproto].match; trav->class = next_class[trav->class]; break; case MTTG_TRAV_NFP_SPEC: trav->curr = trav->curr->next; if (trav->curr != trav->head) break; fallthrough; default: return NULL; } return trav; } static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, bool is_target) { struct nf_mttg_trav *trav = seq->private; unsigned int j; trav->class = MTTG_TRAV_INIT; for (j = 0; j < *pos; ++j) if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL) return NULL; return trav; } static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; switch (trav->class) { case MTTG_TRAV_NFP_UNSPEC: mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); break; case MTTG_TRAV_NFP_SPEC: mutex_unlock(&xt[nfproto].mutex); break; } } static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) { return xt_mttg_seq_start(seq, pos, false); } static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { return xt_mttg_seq_next(seq, v, ppos, false); } static int xt_match_seq_show(struct seq_file *seq, void *v) { const struct nf_mttg_trav *trav = seq->private; const struct xt_match *match; switch (trav->class) { case MTTG_TRAV_NFP_UNSPEC: case MTTG_TRAV_NFP_SPEC: if (trav->curr == trav->head) return 0; match = list_entry(trav->curr, struct xt_match, list); if (*match->name) seq_printf(seq, "%s\n", match->name); } return 0; } static const struct seq_operations xt_match_seq_ops = { .start = xt_match_seq_start, .next = xt_match_seq_next, .stop = xt_mttg_seq_stop, .show = xt_match_seq_show, }; static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) { return xt_mttg_seq_start(seq, pos, true); } static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { return xt_mttg_seq_next(seq, v, ppos, true); } static int xt_target_seq_show(struct seq_file *seq, void *v) { const struct nf_mttg_trav *trav = seq->private; const struct xt_target *target; switch (trav->class) { case MTTG_TRAV_NFP_UNSPEC: case MTTG_TRAV_NFP_SPEC: if (trav->curr == trav->head) return 0; target = list_entry(trav->curr, struct xt_target, list); if (*target->name) seq_printf(seq, "%s\n", target->name); } return 0; } static const struct seq_operations xt_target_seq_ops = { .start = xt_target_seq_start, .next = xt_target_seq_next, .stop = xt_mttg_seq_stop, .show = xt_target_seq_show, }; #define FORMAT_TABLES "_tables_names" #define FORMAT_MATCHES "_tables_matches" #define FORMAT_TARGETS "_tables_targets" #endif /* CONFIG_PROC_FS */ /** * xt_hook_ops_alloc - set up hooks for a new table * @table: table with metadata needed to set up hooks * @fn: Hook function * * This function will create the nf_hook_ops that the x_table needs * to hand to xt_hook_link_net(). 
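 *
 * Example (illustrative; "packet_filter" and "my_table_hook" are placeholder
 * names for a table definition and its nf_hookfn):
 *
 *	ops = xt_hook_ops_alloc(&packet_filter, my_table_hook);
 *	if (IS_ERR(ops))
 *		return PTR_ERR(ops);
 *
 * The resulting array has one entry per bit set in table->valid_hooks and
 * is typically registered per net namespace, e.g. via nf_register_net_hooks().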
*/ struct nf_hook_ops * xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops; if (!num_hooks) return ERR_PTR(-EINVAL); ops = kcalloc(num_hooks, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0; hook_mask >>= 1, ++hooknum) { if (!(hook_mask & 1)) continue; ops[i].hook = fn; ops[i].pf = table->af; ops[i].hooknum = hooknum; ops[i].priority = table->priority; ++i; } return ops; } EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); int xt_register_template(const struct xt_table *table, int (*table_init)(struct net *net)) { int ret = -EEXIST, af = table->af; struct xt_template *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_templates[af], list) { if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0)) goto out_unlock; } ret = -ENOMEM; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out_unlock; BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name)); strscpy(t->name, table->name, sizeof(t->name)); t->table_init = table_init; t->me = table->me; list_add(&t->list, &xt_templates[af]); ret = 0; out_unlock: mutex_unlock(&xt[af].mutex); return ret; } EXPORT_SYMBOL_GPL(xt_register_template); void xt_unregister_template(const struct xt_table *table) { struct xt_template *t; int af = table->af; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_templates[af], list) { if (strcmp(table->name, t->name)) continue; list_del(&t->list); mutex_unlock(&xt[af].mutex); kfree(t); return; } mutex_unlock(&xt[af].mutex); WARN_ON_ONCE(1); } EXPORT_SYMBOL_GPL(xt_unregister_template); int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; #endif if (af >= ARRAY_SIZE(xt_prefix)) return -EINVAL; #ifdef CONFIG_PROC_FS root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops, sizeof(struct seq_net_private), (void *)(unsigned long)af); if (!proc) goto out; if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_match_seq_ops, sizeof(struct nf_mttg_trav), (void *)(unsigned long)af); if (!proc) goto out_remove_tables; if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_target_seq_ops, sizeof(struct nf_mttg_trav), (void *)(unsigned long)af); if (!proc) goto out_remove_matches; if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif return 0; #ifdef CONFIG_PROC_FS out_remove_matches: strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out_remove_tables: strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out: return -1; #endif } EXPORT_SYMBOL_GPL(xt_proto_init); void xt_proto_fini(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; strscpy(buf, xt_prefix[af], 
sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); remove_proc_entry(buf, net->proc_net); strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ } EXPORT_SYMBOL_GPL(xt_proto_fini); /** * xt_percpu_counter_alloc - allocate x_tables rule counter * * @state: pointer to xt_percpu allocation state * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct * * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then * contain the address of the real (percpu) counter. * * Rule evaluation needs to use xt_get_this_cpu_counter() helper * to fetch the real percpu counter. * * To speed up allocation and improve data locality, a 4kb block is * allocated. Freeing any counter may free an entire block, so all * counters allocated using the same state must be freed at the same * time. * * xt_percpu_counter_alloc_state contains the base address of the * allocated page and the current sub-offset. * * returns false on error. */ bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter) { BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2)); if (nr_cpu_ids <= 1) return true; if (!state->mem) { state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE, XT_PCPU_BLOCK_SIZE); if (!state->mem) return false; } counter->pcnt = (__force unsigned long)(state->mem + state->off); state->off += sizeof(*counter); if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) { state->mem = NULL; state->off = 0; } return true; } EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc); void xt_percpu_counter_free(struct xt_counters *counters) { unsigned long pcnt = counters->pcnt; if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0) free_percpu((void __percpu *)pcnt); } EXPORT_SYMBOL_GPL(xt_percpu_counter_free); static int __net_init xt_net_init(struct net *net) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) INIT_LIST_HEAD(&xt_net->tables[i]); return 0; } static void __net_exit xt_net_exit(struct net *net) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); } static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, .id = &xt_pernet_id, .size = sizeof(struct xt_pernet), }; static int __init xt_init(void) { unsigned int i; int rv; for_each_possible_cpu(i) { seqcount_init(&per_cpu(xt_recseq, i)); } xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); if (!xt) return -ENOMEM; for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT mutex_init(&xt[i].compat_mutex); xt[i].compat_tab = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); INIT_LIST_HEAD(&xt_templates[i]); } rv = register_pernet_subsys(&xt_net_ops); if (rv < 0) kfree(xt); return rv; } static void __exit xt_fini(void) { unregister_pernet_subsys(&xt_net_ops); kfree(xt); } module_init(xt_init); module_exit(xt_fini);
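/*
 * Example (illustrative, not part of x_tables itself): a minimal match
 * extension built on the registration API above.  All names (foo_mt,
 * foo_mt_reg, struct xt_foo_info) are hypothetical placeholders.
 *
 *	static bool foo_mt(const struct sk_buff *skb, struct xt_action_param *par)
 *	{
 *		const struct xt_foo_info *info = par->matchinfo;
 *
 *		return skb->len >= info->min_len;
 *	}
 *
 *	static struct xt_match foo_mt_reg __read_mostly = {
 *		.name      = "foo",
 *		.revision  = 0,
 *		.family    = NFPROTO_UNSPEC,
 *		.match     = foo_mt,
 *		.matchsize = sizeof(struct xt_foo_info),
 *		.me        = THIS_MODULE,
 *	};
 *
 * The module's init/exit callbacks then call xt_register_match() and
 * xt_unregister_match() on foo_mt_reg, and userspace selects the match by
 * the "foo" name and revision.
 */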