Total coverage: 434422 (25%)of 1761464
295 293 294 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0 /* Lock down the kernel * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #include <linux/security.h> #include <linux/export.h> #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> static enum lockdown_reason kernel_locked_down; static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX}; /* * Put the kernel into lock-down mode. */ static int lock_kernel_down(const char *where, enum lockdown_reason level) { if (kernel_locked_down >= level) return -EPERM; kernel_locked_down = level; pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n", where); return 0; } static int __init lockdown_param(char *level) { if (!level) return -EINVAL; if (strcmp(level, "integrity") == 0) lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX); else if (strcmp(level, "confidentiality") == 0) lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX); else return -EINVAL; return 0; } early_param("lockdown", lockdown_param); /** * lockdown_is_locked_down - Find out if the kernel is locked down * @what: Tag to use in notice generated if lockdown is in effect */ static int lockdown_is_locked_down(enum lockdown_reason what) { if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX, "Invalid lockdown reason")) return -EPERM; if (kernel_locked_down >= what) { if (lockdown_reasons[what]) pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", current->comm, lockdown_reasons[what]); return -EPERM; } return 0; } static struct security_hook_list lockdown_hooks[] __ro_after_init = { LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), }; static const struct lsm_id lockdown_lsmid = { .name = "lockdown", .id = LSM_ID_LOCKDOWN, }; static int __init lockdown_lsm_init(void) { #if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX); #elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY) lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); #endif security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), &lockdown_lsmid); return 0; } static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char temp[80]; int i, offset = 0; for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; if (lockdown_reasons[level]) { const char *label = lockdown_reasons[level]; if (kernel_locked_down == level) offset += sprintf(temp+offset, "[%s] ", label); else offset += sprintf(temp+offset, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (offset > 0) temp[offset-1] = '\n'; return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); } static ssize_t lockdown_write(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { char *state; int i, len, err = -EINVAL; state = memdup_user_nul(buf, n); if (IS_ERR(state)) return PTR_ERR(state); len = strlen(state); if (len && state[len-1] == '\n') { state[len-1] = '\0'; len--; } for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; const char *label = lockdown_reasons[level]; if (label && !strcmp(state, label)) err = lock_kernel_down("securityfs", level); } kfree(state); return err ? err : n; } static const struct file_operations lockdown_ops = { .read = lockdown_read, .write = lockdown_write, }; static int __init lockdown_secfs_init(void) { struct dentry *dentry; dentry = securityfs_create_file("lockdown", 0644, NULL, NULL, &lockdown_ops); return PTR_ERR_OR_ZERO(dentry); } core_initcall(lockdown_secfs_init); #ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY DEFINE_EARLY_LSM(lockdown) = { #else DEFINE_LSM(lockdown) = { #endif .name = "lockdown", .init = lockdown_lsm_init, };
6 12 12 12 12 12 12 12 12 12 12 959 963 9 9 9 3 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1484 752 336 66 1485 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/busy_poll.h> #include <net/net_namespace.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include "dev.h" #include "devmem.h" #include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } rtnl_unlock(); return err; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { void *hdr; pid_t pid; if (WARN_ON_ONCE(!napi->dev)) return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (napi->napi_id >= MIN_NAPI_ID && nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); napi = napi_by_id(napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; int err = 0; if (!(netdev->flags & IFF_UP)) return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev) err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); else err = -ENODEV; } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } rtnl_unlock(); return err; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; binding = rxq->mp_params.mp_priv; if (binding && nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, txq->napi->napi_id)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err = 0; if (!(netdev->flags & IFF_UP)) return err; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { int err = 0; int i; if (!(netdev->flags & IFF_UP)) return err; for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { err = netdev_nl_queue_fill_one(rsp, netdev, i, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; ctx->rxq_idx = i++; } for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { err = netdev_nl_queue_fill_one(rsp, netdev, i, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; ctx->txq_idx = i++; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev) err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); else err = -ENODEV; } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } rtnl_unlock(); return err; } #define NETDEV_STAT_NOT_SET (~0ULL) static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) { const u64 *add = _add; u64 *sum = _sum; while (size) { if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) *sum += *add; sum++; add++; size -= 8; } } static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) { if (value == NETDEV_STAT_NOT_SET) return 0; return nla_put_uint(rsp, attr_id, value); } static int netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, u32 q_type, int i, const struct genl_info *info) { const struct netdev_stat_ops *ops = netdev->stat_ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: memset(&rx, 0xff, sizeof(rx)); ops->get_queue_stats_rx(netdev, i, &rx); if (!memchr_inv(&rx, 0xff, sizeof(rx))) goto nla_cancel; if (netdev_nl_stats_write_rx(rsp, &rx)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: memset(&tx, 0xff, sizeof(tx)); ops->get_queue_stats_tx(netdev, i, &tx); if (!memchr_inv(&tx, 0xff, sizeof(tx))) goto nla_cancel; if (netdev_nl_stats_write_tx(rsp, &tx)) goto nla_put_failure; break; } genlmsg_end(rsp, hdr); return 0; nla_cancel: genlmsg_cancel(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { const struct netdev_stat_ops *ops = netdev->stat_ops; int i, err; if (!(netdev->flags & IFF_UP)) return 0; i = ctx->rxq_idx; while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, i, info); if (err) return err; ctx->rxq_idx = i++; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, i, info); if (err) return err; ctx->txq_idx = i++; } ctx->rxq_idx = 0; ctx->txq_idx = 0; return 0; } static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { struct netdev_queue_stats_rx rx_sum, rx; struct netdev_queue_stats_tx tx_sum, tx; const struct netdev_stat_ops *ops; void *hdr; int i; ops = netdev->stat_ops; /* Netdev can't guarantee any complete counters */ if (!ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; for (i = 0; i < netdev->real_num_rx_queues; i++) { memset(&rx, 0xff, sizeof(rx)); if (ops->get_queue_stats_rx) ops->get_queue_stats_rx(netdev, i, &rx); netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); } for (i = 0; i < netdev->real_num_tx_queues; i++) { memset(&tx, 0xff, sizeof(tx)); if (ops->get_queue_stats_tx) ops->get_queue_stats_tx(netdev, i, &tx); netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); } if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, struct sk_buff *skb, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { if (!netdev->stat_ops) return 0; switch (scope) { case 0: return netdev_nl_stats_by_netdev(netdev, skb, info); case NETDEV_QSTATS_SCOPE_QUEUE: return netdev_nl_stats_by_queue(netdev, skb, info, ctx); } return -EINVAL; /* Should not happen, per netlink policy */ } int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; unsigned int ifindex; unsigned int scope; int err = 0; scope = 0; if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); ifindex = 0; if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev && netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); err = netdev ? -EOPNOTSUPP : -ENODEV; } } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); if (err < 0) break; } } rtnl_unlock(); return err; } int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct net_devmem_dmabuf_binding *binding; struct list_head *sock_binding_list; u32 ifindex, dmabuf_fd, rxq_idx; struct net_device *netdev; struct sk_buff *rsp; struct nlattr *attr; int rem, err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); sock_binding_list = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(sock_binding_list)) return PTR_ERR(sock_binding_list); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (!netdev || !netif_device_present(netdev)) { err = -ENODEV; goto err_unlock; } if (dev_xdp_prog_count(netdev)) { NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); err = -EEXIST; goto err_unlock; } binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; } nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { err = nla_parse_nested( tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, netdev_queue_id_nl_policy, info->extack); if (err < 0) goto err_unbind; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { err = -EINVAL; goto err_unbind; } if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); err = -EINVAL; goto err_unbind; } rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) goto err_unbind; } list_add(&binding->list, sock_binding_list); nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); err = genlmsg_reply(rsp, info); if (err) goto err_unbind; rtnl_unlock(); return 0; err_unbind: net_devmem_unbind_dmabuf(binding); err_unlock: rtnl_unlock(); err_genlmsg_free: nlmsg_free(rsp); return err; } void netdev_nl_sock_priv_init(struct list_head *priv) { INIT_LIST_HEAD(priv); } void netdev_nl_sock_priv_destroy(struct list_head *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; list_for_each_entry_safe(binding, temp, priv, list) { rtnl_lock(); net_devmem_unbind_dmabuf(binding); rtnl_unlock(); } } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); break; case NETDEV_UNREGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init);
71 72 71 47 47 47 13 13 13 13 1 45 83 83 83 7 7 7 7 7 7 7 2 2 2 2 61 3 58 3 2 2 2 2 2 2 2 73 70 73 1 72 1 71 1 13 69 1 68 1 67 67 67 61 6 59 2 2 1 82 6 2 2 1 5 2 4 3 2 3 82 1 1 47 47 47 47 73 1 73 72 72 73 61 2 61 2 2 61 61 2 8 53 4 49 49 3 1 1 48 48 47 47 47 73 72 1 1 1 1 1 1 1 37 37 83 10 10 82 82 82 82 27 1 26 27 27 1 27 2 2 2 3 3 3 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/exportfs.h> #include <linux/backing-dev.h> #include "xattr.h" #define CREATE_TRACE_POINTS #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); else pr_err("%s: %pV", func, &vaf); va_end(args); } void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) pr_info("(device %s): %pV", sb->s_id, &vaf); else pr_info("%pV", &vaf); va_end(args); } static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { size_t len = 1 << EROFS_SB(sb)->blkszbits; struct erofs_super_block *dsb; u32 expected_crc, crc; if (len > EROFS_SUPER_OFFSET) len -= EROFS_SUPER_OFFSET; dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); if (!dsb) return -ENOMEM; expected_crc = le32_to_cpu(dsb->checksum); dsb->checksum = 0; /* to allow for x86 boot sectors and other oddities. */ crc = crc32c(~0, dsb, len); kfree(dsb); if (crc != expected_crc) { erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", crc, expected_crc); return -EBADMSG; } return 0; } static void erofs_inode_init_once(void *ptr) { struct erofs_inode *vi = ptr; inode_init_once(&vi->vfs_inode); } static struct inode *erofs_alloc_inode(struct super_block *sb) { struct erofs_inode *vi = alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; /* zero out everything except vfs_inode */ memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); return &vi->vfs_inode; } static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); kmem_cache_free(erofs_inode_cachep, vi); } /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) { u8 *buffer, *ptr; int len, i, cnt; *offset = round_up(*offset, 4); ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) return ptr; len = le16_to_cpu(*(__le16 *)ptr); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); *offset += sizeof(__le16); *lengthp = len; for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; } #ifndef CONFIG_EROFS_FS_ZIP static int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { if (!dsb->u1.available_compr_algs) return 0; erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); return -EOPNOTSUPP; } #endif static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct file *file; dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); if (IS_ERR(dis)) return PTR_ERR(dis); if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; } dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL); if (!dif->path) return -ENOMEM; } if (erofs_is_fscache_mode(sb)) { fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { file = erofs_is_fileio_mode(sbi) ? filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); if (IS_ERR(file)) return PTR_ERR(file); dif->file = file; if (!erofs_is_fileio_mode(sbi)) dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; } static int erofs_scan_devices(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); unsigned int ondisk_extradevs; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_device_info *dif; int id, err = 0; sbi->total_blocks = sbi->primarydevice_blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else ondisk_extradevs = le16_to_cpu(dsb->extra_devices); if (sbi->devs->extra_devices && ondisk_extradevs != sbi->devs->extra_devices) { erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", ondisk_extradevs, sbi->devs->extra_devices); return -EINVAL; } if (!ondisk_extradevs) return 0; if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) sbi->devs->flatdev = true; sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); if (sbi->devs->extra_devices) { idr_for_each_entry(&sbi->devs->tree, dif, id) { err = erofs_init_device(&buf, sb, dif, &pos); if (err) break; } } else { for (id = 0; id < ondisk_extradevs; id++) { dif = kzalloc(sizeof(*dif), GFP_KERNEL); if (!dif) { err = -ENOMEM; break; } err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); if (err < 0) { kfree(dif); break; } ++sbi->devs->extra_devices; err = erofs_init_device(&buf, sb, dif, &pos); if (err) break; } } up_read(&sbi->devs->rwsem); erofs_put_metabuf(&buf); return err; } static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; int ret; data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); } dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); goto out; } sbi->blkszbits = dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; } if (dsb->dirblkbits) { erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); goto out; } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; } ret = -EINVAL; sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", sbi->sb_size); goto out; } sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); if (ret < 0) { /* -E2BIG */ erofs_err(sb, "bad volume name without NIL terminator"); ret = -EFSCORRUPTED; goto out; } /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) goto out; /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; sbi->opt.max_sync_decompress_pages = 3; sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(&sbi->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL set_opt(&sbi->opt, POSIX_ACL); #endif } enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, Opt_device, Opt_fsid, Opt_domain_id, Opt_err }; static const struct constant_table erofs_param_cache_strategy[] = { {"disabled", EROFS_ZIP_CACHE_DISABLED}, {"readahead", EROFS_ZIP_CACHE_READAHEAD}, {"readaround", EROFS_ZIP_CACHE_READAROUND}, {} }; static const struct constant_table erofs_dax_param_enums[] = { {"always", EROFS_MOUNT_DAX_ALWAYS}, {"never", EROFS_MOUNT_DAX_NEVER}, {} }; static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", Opt_cache_strategy, erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), {} }; static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) { #ifdef CONFIG_FS_DAX struct erofs_sb_info *sbi = fc->s_fs_info; switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: set_opt(&sbi->opt, DAX_ALWAYS); clear_opt(&sbi->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: set_opt(&sbi->opt, DAX_NEVER); clear_opt(&sbi->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); return false; } #else errorfc(fc, "dax options not supported"); return false; #endif } static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct erofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; struct erofs_device_info *dif; int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) set_opt(&sbi->opt, XATTR_USER); else clear_opt(&sbi->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif break; case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) set_opt(&sbi->opt, POSIX_ACL); else clear_opt(&sbi->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP sbi->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif break; case Opt_dax: if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) return -EINVAL; break; case Opt_dax_enum: if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; case Opt_device: dif = kzalloc(sizeof(*dif), GFP_KERNEL); if (!dif) return -ENOMEM; dif->path = kstrdup(param->string, GFP_KERNEL); if (!dif->path) { kfree(dif); return -ENOMEM; } down_write(&sbi->devs->rwsem); ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); up_write(&sbi->devs->rwsem); if (ret < 0) { kfree(dif->path); kfree(dif); return ret; } ++sbi->devs->extra_devices; break; #ifdef CONFIG_EROFS_FS_ONDEMAND case Opt_fsid: kfree(sbi->fsid); sbi->fsid = kstrdup(param->string, GFP_KERNEL); if (!sbi->fsid) return -ENOMEM; break; case Opt_domain_id: kfree(sbi->domain_id); sbi->domain_id = kstrdup(param->string, GFP_KERNEL); if (!sbi->domain_id) return -ENOMEM; break; #else case Opt_fsid: case Opt_domain_id: errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif default: return -ENOPARAM; } return 0; } static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { return erofs_iget(sb, ino); } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, erofs_nfs_get_inode); } static struct dentry *erofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, erofs_nfs_get_inode); } static struct dentry *erofs_get_parent(struct dentry *child) { erofs_nid_t nid; unsigned int d_type; int err; err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, }; static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if (sbi->domain_id) super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, sbi->fsid); else if (sbi->fsid) super_set_sysfs_name_generic(sb, "%s", sbi->fsid); else if (erofs_is_fileio_mode(sbi)) super_set_sysfs_name_generic(sb, "%s", bdi_dev_name(sb->s_bdi)); else super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sb->s_magic = EROFS_SUPER_MAGIC; sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; if (erofs_is_fscache_mode(sb)) { err = erofs_fscache_register_fs(sb); if (err) return err; } err = super_setup_bdi(sb); if (err) return err; } else { if (!sb_set_blocksize(sb, PAGE_SIZE)) { errorfc(fc, "failed to set initial blksize"); return -EINVAL; } sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); if (err) return err; if (sb->s_blocksize_bits != sbi->blkszbits) { if (erofs_is_fscache_mode(sb)) { errorfc(fc, "unsupported blksize for fscache mode"); return -EINVAL; } if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { errorfc(fc, "failed to set erofs blksize"); return -EINVAL; } } if (test_opt(&sbi->opt, DAX_ALWAYS)) { if (!sbi->dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { errorfc(fc, "unsupported blocksize for DAX"); clear_opt(&sbi->opt, DAX_ALWAYS); } } sb->s_time_gran = 1; sb->s_xattr = erofs_xattr_handlers; sb->s_export_op = &erofs_export_ops; if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", sbi->root_nid, inode->i_mode); iput(inode); return -EINVAL; } sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(sbi->packed_inode)) { err = PTR_ERR(sbi->packed_inode); sbi->packed_inode = NULL; return err; } } err = erofs_init_managed_cache(sb); if (err) return err; err = erofs_xattr_prefixes_init(sb); if (err) return err; erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); ret = get_tree_bdev(fc, erofs_fc_fill_super); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { if (!fc->source) return invalf(fc, "No source specified"); sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(sbi->fdev)) return PTR_ERR(sbi->fdev); return get_tree_nodev(fc, erofs_fc_fill_super); } #endif return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_sb_info *new_sbi = fc->s_fs_info; DBG_BUGON(!sb_rdonly(sb)); if (new_sbi->fsid || new_sbi->domain_id) erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); if (test_opt(&new_sbi->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; sbi->opt = new_sbi->opt; fc->sb_flags |= SB_RDONLY; return 0; } static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); if (dif->file) fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; } static void erofs_free_dev_context(struct erofs_dev_context *devs) { if (!devs) return; idr_for_each(&devs->tree, &erofs_release_device_info, NULL); idr_destroy(&devs->tree); kfree(devs); } static void erofs_fc_free(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; if (!sbi) return; erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); kfree(sbi); } static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, .reconfigure = erofs_fc_reconfigure, .free = erofs_fc_free, }; static int erofs_init_fs_context(struct fs_context *fc) { struct erofs_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); if (!sbi->devs) { kfree(sbi); return -ENOMEM; } fc->s_fs_info = sbi; idr_init(&sbi->devs->tree); init_rwsem(&sbi->devs->rwsem); erofs_default_options(sbi); fc->ops = &erofs_context_ops; return 0; } static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) kill_anon_super(sb); else kill_block_super(sb); erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); if (sbi->fdev) fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); DBG_BUGON(!sbi); erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; #endif iput(sbi->packed_inode); sbi->packed_inode = NULL; erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } static struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, .kill_sb = erofs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("erofs"); static int __init erofs_module_init(void) { int err; erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", sizeof(struct erofs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; err = erofs_init_shrinker(); if (err) goto shrinker_err; err = z_erofs_init_subsystem(); if (err) goto zip_err; err = erofs_init_sysfs(); if (err) goto sysfs_err; err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; return 0; fs_err: erofs_exit_sysfs(); sysfs_err: z_erofs_exit_subsystem(); zip_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); return err; } static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); erofs_exit_sysfs(); z_erofs_exit_subsystem(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); } static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; buf->f_ffree = ULLONG_MAX - sbi->inos; buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); struct erofs_mount_opts *opt = &sbi->opt; if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) seq_puts(seq, test_opt(opt, XATTR_USER) ? ",user_xattr" : ",nouser_xattr"); if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl"); if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) seq_printf(seq, ",cache_strategy=%s", erofs_param_cache_strategy[opt->cache_strategy].name); if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); if (sbi->domain_id) seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif return 0; } const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; module_init(erofs_module_init); module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 // SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for esd electronics gmbh CAN-USB/2, CAN-USB/3 and CAN-USB/Micro * * Copyright (C) 2010-2012 esd electronic system design gmbh, Matthias Fuchs <socketcan@esd.eu> * Copyright (C) 2022-2024 esd electronics gmbh, Frank Jungclaus <frank.jungclaus@esd.eu> */ #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include <linux/ethtool.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/units.h> #include <linux/usb.h> MODULE_AUTHOR("Matthias Fuchs <socketcan@esd.eu>"); MODULE_AUTHOR("Frank Jungclaus <frank.jungclaus@esd.eu>"); MODULE_DESCRIPTION("CAN driver for esd electronics gmbh CAN-USB/2, CAN-USB/3 and CAN-USB/Micro interfaces"); MODULE_LICENSE("GPL v2"); /* USB vendor and product ID */ #define ESD_USB_ESDGMBH_VENDOR_ID 0x0ab4 #define ESD_USB_CANUSB2_PRODUCT_ID 0x0010 #define ESD_USB_CANUSBM_PRODUCT_ID 0x0011 #define ESD_USB_CANUSB3_PRODUCT_ID 0x0014 /* CAN controller clock frequencies */ #define ESD_USB_2_CAN_CLOCK (60 * MEGA) /* Hz */ #define ESD_USB_M_CAN_CLOCK (36 * MEGA) /* Hz */ #define ESD_USB_3_CAN_CLOCK (80 * MEGA) /* Hz */ /* Maximum number of CAN nets */ #define ESD_USB_MAX_NETS 2 /* USB commands */ #define ESD_USB_CMD_VERSION 1 /* also used for VERSION_REPLY */ #define ESD_USB_CMD_CAN_RX 2 /* device to host only */ #define ESD_USB_CMD_CAN_TX 3 /* also used for TX_DONE */ #define ESD_USB_CMD_SETBAUD 4 /* also used for SETBAUD_REPLY */ #define ESD_USB_CMD_TS 5 /* also used for TS_REPLY */ #define ESD_USB_CMD_IDADD 6 /* also used for IDADD_REPLY */ /* esd CAN message flags - dlc field */ #define ESD_USB_RTR BIT(4) #define ESD_USB_NO_BRS BIT(4) #define ESD_USB_ESI BIT(5) #define ESD_USB_FD BIT(7) /* esd CAN message flags - id field */ #define ESD_USB_EXTID BIT(29) #define ESD_USB_EVENT BIT(30) #define ESD_USB_IDMASK GENMASK(28, 0) /* esd CAN event ids */ #define ESD_USB_EV_CAN_ERROR_EXT 2 /* CAN controller specific diagnostic data */ /* baudrate message flags */ #define ESD_USB_LOM BIT(30) /* Listen Only Mode */ #define ESD_USB_UBR BIT(31) /* User Bit Rate (controller BTR) in bits 0..27 */ #define ESD_USB_NO_BAUDRATE GENMASK(30, 0) /* bit rate unconfigured */ /* bit timing esd CAN-USB */ #define ESD_USB_2_TSEG1_SHIFT 16 #define ESD_USB_2_TSEG2_SHIFT 20 #define ESD_USB_2_SJW_SHIFT 14 #define ESD_USB_M_SJW_SHIFT 24 #define ESD_USB_TRIPLE_SAMPLES BIT(23) /* Transmitter Delay Compensation */ #define ESD_USB_3_TDC_MODE_AUTO 0 /* esd IDADD message */ #define ESD_USB_ID_ENABLE BIT(7) #define ESD_USB_MAX_ID_SEGMENT 64 /* SJA1000 ECC register (emulated by usb firmware) */ #define ESD_USB_SJA1000_ECC_SEG GENMASK(4, 0) #define ESD_USB_SJA1000_ECC_DIR BIT(5) #define ESD_USB_SJA1000_ECC_ERR BIT(2, 1) #define ESD_USB_SJA1000_ECC_BIT 0x00 #define ESD_USB_SJA1000_ECC_FORM BIT(6) #define ESD_USB_SJA1000_ECC_STUFF BIT(7) #define ESD_USB_SJA1000_ECC_MASK GENMASK(7, 6) /* esd bus state event codes */ #define ESD_USB_BUSSTATE_MASK GENMASK(7, 6) #define ESD_USB_BUSSTATE_WARN BIT(6) #define ESD_USB_BUSSTATE_ERRPASSIVE BIT(7) #define ESD_USB_BUSSTATE_BUSOFF GENMASK(7, 6) #define ESD_USB_RX_BUFFER_SIZE 1024 #define ESD_USB_MAX_RX_URBS 4 #define ESD_USB_MAX_TX_URBS 16 /* must be power of 2 */ /* Modes for CAN-USB/3, to be used for esd_usb_3_set_baudrate_msg_x.mode */ #define ESD_USB_3_BAUDRATE_MODE_DISABLE 0 /* remove from bus */ #define ESD_USB_3_BAUDRATE_MODE_INDEX 1 /* ESD (CiA) bit rate idx */ #define ESD_USB_3_BAUDRATE_MODE_BTR_CTRL 2 /* BTR values (controller)*/ #define ESD_USB_3_BAUDRATE_MODE_BTR_CANONICAL 3 /* BTR values (canonical) */ #define ESD_USB_3_BAUDRATE_MODE_NUM 4 /* numerical bit rate */ #define ESD_USB_3_BAUDRATE_MODE_AUTOBAUD 5 /* autobaud */ /* Flags for CAN-USB/3, to be used for esd_usb_3_set_baudrate_msg_x.flags */ #define ESD_USB_3_BAUDRATE_FLAG_FD BIT(0) /* enable CAN FD mode */ #define ESD_USB_3_BAUDRATE_FLAG_LOM BIT(1) /* enable listen only mode */ #define ESD_USB_3_BAUDRATE_FLAG_STM BIT(2) /* enable self test mode */ #define ESD_USB_3_BAUDRATE_FLAG_TRS BIT(3) /* enable triple sampling */ #define ESD_USB_3_BAUDRATE_FLAG_TXP BIT(4) /* enable transmit pause */ struct esd_usb_header_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 rsvd[2]; }; struct esd_usb_version_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 rsvd; u8 flags; __le32 drv_version; }; struct esd_usb_version_reply_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 nets; u8 features; __le32 version; u8 name[16]; __le32 rsvd; __le32 ts; }; struct esd_usb_rx_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 dlc; __le32 ts; __le32 id; /* upper 3 bits contain flags */ union { u8 data[CAN_MAX_DLEN]; u8 data_fd[CANFD_MAX_DLEN]; struct { u8 status; /* CAN Controller Status */ u8 ecc; /* Error Capture Register */ u8 rec; /* RX Error Counter */ u8 tec; /* TX Error Counter */ } ev_can_err_ext; /* For ESD_EV_CAN_ERROR_EXT */ }; }; struct esd_usb_tx_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 dlc; u32 hnd; /* opaque handle, not used by device */ __le32 id; /* upper 3 bits contain flags */ union { u8 data[CAN_MAX_DLEN]; u8 data_fd[CANFD_MAX_DLEN]; }; }; struct esd_usb_tx_done_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 status; u32 hnd; /* opaque handle, not used by device */ __le32 ts; }; struct esd_usb_id_filter_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 option; __le32 mask[ESD_USB_MAX_ID_SEGMENT + 1]; /* +1 for 29bit extended IDs */ }; struct esd_usb_set_baudrate_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 rsvd; __le32 baud; }; /* CAN-USB/3 baudrate configuration, used for nominal as well as for data bit rate */ struct esd_usb_3_baudrate_cfg { __le16 brp; /* bit rate pre-scaler */ __le16 tseg1; /* time segment before sample point */ __le16 tseg2; /* time segment after sample point */ __le16 sjw; /* synchronization jump Width */ }; /* In principle, the esd CAN-USB/3 supports Transmitter Delay Compensation (TDC), * but currently only the automatic TDC mode is supported by this driver. * An implementation for manual TDC configuration will follow. * * For information about struct esd_usb_3_tdc_cfg, see * NTCAN Application Developers Manual, 6.2.25 NTCAN_TDC_CFG + related chapters * https://esd.eu/fileadmin/esd/docs/manuals/NTCAN_Part1_Function_API_Manual_en_56.pdf */ struct esd_usb_3_tdc_cfg { u8 tdc_mode; /* transmitter delay compensation mode */ u8 ssp_offset; /* secondary sample point offset in mtq */ s8 ssp_shift; /* secondary sample point shift in mtq */ u8 tdc_filter; /* TDC filter in mtq */ }; /* Extended version of the above set_baudrate_msg for a CAN-USB/3 * to define the CAN bit timing configuration of the CAN controller in * CAN FD mode as well as in Classical CAN mode. * * The payload of this command is a NTCAN_BAUDRATE_X structure according to * esd electronics gmbh, NTCAN Application Developers Manual, 6.2.15 NTCAN_BAUDRATE_X * https://esd.eu/fileadmin/esd/docs/manuals/NTCAN_Part1_Function_API_Manual_en_56.pdf */ struct esd_usb_3_set_baudrate_msg_x { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 rsvd; /*reserved */ /* Payload ... */ __le16 mode; /* mode word, see ESD_USB_3_BAUDRATE_MODE_xxx */ __le16 flags; /* control flags, see ESD_USB_3_BAUDRATE_FLAG_xxx */ struct esd_usb_3_tdc_cfg tdc; /* TDC configuration */ struct esd_usb_3_baudrate_cfg nom; /* nominal bit rate */ struct esd_usb_3_baudrate_cfg data; /* data bit rate */ }; /* Main message type used between library and application */ union __packed esd_usb_msg { struct esd_usb_header_msg hdr; struct esd_usb_version_msg version; struct esd_usb_version_reply_msg version_reply; struct esd_usb_rx_msg rx; struct esd_usb_tx_msg tx; struct esd_usb_tx_done_msg txdone; struct esd_usb_set_baudrate_msg setbaud; struct esd_usb_3_set_baudrate_msg_x setbaud_x; struct esd_usb_id_filter_msg filter; }; static struct usb_device_id esd_usb_table[] = { {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSB2_PRODUCT_ID)}, {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSBM_PRODUCT_ID)}, {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSB3_PRODUCT_ID)}, {} }; MODULE_DEVICE_TABLE(usb, esd_usb_table); struct esd_usb_net_priv; struct esd_tx_urb_context { struct esd_usb_net_priv *priv; u32 echo_index; }; struct esd_usb { struct usb_device *udev; struct esd_usb_net_priv *nets[ESD_USB_MAX_NETS]; struct usb_anchor rx_submitted; int net_count; u32 version; int rxinitdone; void *rxbuf[ESD_USB_MAX_RX_URBS]; dma_addr_t rxbuf_dma[ESD_USB_MAX_RX_URBS]; }; struct esd_usb_net_priv { struct can_priv can; /* must be the first member */ atomic_t active_tx_jobs; struct usb_anchor tx_submitted; struct esd_tx_urb_context tx_contexts[ESD_USB_MAX_TX_URBS]; struct esd_usb *usb; struct net_device *netdev; int index; u8 old_state; struct can_berr_counter bec; }; static void esd_usb_rx_event(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct can_frame *cf; struct sk_buff *skb; u32 id = le32_to_cpu(msg->rx.id) & ESD_USB_IDMASK; if (id == ESD_USB_EV_CAN_ERROR_EXT) { u8 state = msg->rx.ev_can_err_ext.status; u8 ecc = msg->rx.ev_can_err_ext.ecc; priv->bec.rxerr = msg->rx.ev_can_err_ext.rec; priv->bec.txerr = msg->rx.ev_can_err_ext.tec; netdev_dbg(priv->netdev, "CAN_ERR_EV_EXT: dlc=%#02x state=%02x ecc=%02x rec=%02x tec=%02x\n", msg->rx.dlc, state, ecc, priv->bec.rxerr, priv->bec.txerr); /* if berr-reporting is off, only pass through on state change ... */ if (!(priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) && state == priv->old_state) return; skb = alloc_can_err_skb(priv->netdev, &cf); if (!skb) stats->rx_dropped++; if (state != priv->old_state) { enum can_state tx_state, rx_state; enum can_state new_state = CAN_STATE_ERROR_ACTIVE; priv->old_state = state; switch (state & ESD_USB_BUSSTATE_MASK) { case ESD_USB_BUSSTATE_BUSOFF: new_state = CAN_STATE_BUS_OFF; can_bus_off(priv->netdev); break; case ESD_USB_BUSSTATE_WARN: new_state = CAN_STATE_ERROR_WARNING; break; case ESD_USB_BUSSTATE_ERRPASSIVE: new_state = CAN_STATE_ERROR_PASSIVE; break; default: new_state = CAN_STATE_ERROR_ACTIVE; priv->bec.txerr = 0; priv->bec.rxerr = 0; break; } if (new_state != priv->can.state) { tx_state = (priv->bec.txerr >= priv->bec.rxerr) ? new_state : 0; rx_state = (priv->bec.txerr <= priv->bec.rxerr) ? new_state : 0; can_change_state(priv->netdev, cf, tx_state, rx_state); } } else if (skb) { priv->can.can_stats.bus_error++; stats->rx_errors++; cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; switch (ecc & ESD_USB_SJA1000_ECC_MASK) { case ESD_USB_SJA1000_ECC_BIT: cf->data[2] |= CAN_ERR_PROT_BIT; break; case ESD_USB_SJA1000_ECC_FORM: cf->data[2] |= CAN_ERR_PROT_FORM; break; case ESD_USB_SJA1000_ECC_STUFF: cf->data[2] |= CAN_ERR_PROT_STUFF; break; default: break; } /* Error occurred during transmission? */ if (!(ecc & ESD_USB_SJA1000_ECC_DIR)) cf->data[2] |= CAN_ERR_PROT_TX; /* Bit stream position in CAN frame as the error was detected */ cf->data[3] = ecc & ESD_USB_SJA1000_ECC_SEG; } if (skb) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = priv->bec.txerr; cf->data[7] = priv->bec.rxerr; netif_rx(skb); } } } static void esd_usb_rx_can_msg(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct can_frame *cf; struct canfd_frame *cfd; struct sk_buff *skb; u32 id; u8 len; if (!netif_device_present(priv->netdev)) return; id = le32_to_cpu(msg->rx.id); if (id & ESD_USB_EVENT) { esd_usb_rx_event(priv, msg); } else { if (msg->rx.dlc & ESD_USB_FD) { skb = alloc_canfd_skb(priv->netdev, &cfd); } else { skb = alloc_can_skb(priv->netdev, &cf); cfd = (struct canfd_frame *)cf; } if (skb == NULL) { stats->rx_dropped++; return; } cfd->can_id = id & ESD_USB_IDMASK; if (msg->rx.dlc & ESD_USB_FD) { /* masking by 0x0F is already done within can_fd_dlc2len() */ cfd->len = can_fd_dlc2len(msg->rx.dlc); len = cfd->len; if ((msg->rx.dlc & ESD_USB_NO_BRS) == 0) cfd->flags |= CANFD_BRS; if (msg->rx.dlc & ESD_USB_ESI) cfd->flags |= CANFD_ESI; } else { can_frame_set_cc_len(cf, msg->rx.dlc & ~ESD_USB_RTR, priv->can.ctrlmode); len = cf->len; if (msg->rx.dlc & ESD_USB_RTR) { cf->can_id |= CAN_RTR_FLAG; len = 0; } } if (id & ESD_USB_EXTID) cfd->can_id |= CAN_EFF_FLAG; memcpy(cfd->data, msg->rx.data_fd, len); stats->rx_bytes += len; stats->rx_packets++; netif_rx(skb); } } static void esd_usb_tx_done_msg(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct net_device *netdev = priv->netdev; struct esd_tx_urb_context *context; if (!netif_device_present(netdev)) return; context = &priv->tx_contexts[msg->txdone.hnd & (ESD_USB_MAX_TX_URBS - 1)]; if (!msg->txdone.status) { stats->tx_packets++; stats->tx_bytes += can_get_echo_skb(netdev, context->echo_index, NULL); } else { stats->tx_errors++; can_free_echo_skb(netdev, context->echo_index, NULL); } /* Release context */ context->echo_index = ESD_USB_MAX_TX_URBS; atomic_dec(&priv->active_tx_jobs); netif_wake_queue(netdev); } static void esd_usb_read_bulk_callback(struct urb *urb) { struct esd_usb *dev = urb->context; int retval; int pos = 0; int i; switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: dev_info(dev->udev->dev.parent, "Rx URB aborted (%d)\n", urb->status); goto resubmit_urb; } while (pos < urb->actual_length) { union esd_usb_msg *msg; msg = (union esd_usb_msg *)(urb->transfer_buffer + pos); switch (msg->hdr.cmd) { case ESD_USB_CMD_CAN_RX: if (msg->rx.net >= dev->net_count) { dev_err(dev->udev->dev.parent, "format error\n"); break; } esd_usb_rx_can_msg(dev->nets[msg->rx.net], msg); break; case ESD_USB_CMD_CAN_TX: if (msg->txdone.net >= dev->net_count) { dev_err(dev->udev->dev.parent, "format error\n"); break; } esd_usb_tx_done_msg(dev->nets[msg->txdone.net], msg); break; } pos += msg->hdr.len * sizeof(u32); /* convert to # of bytes */ if (pos > urb->actual_length) { dev_err(dev->udev->dev.parent, "format error\n"); break; } } resubmit_urb: usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), urb->transfer_buffer, ESD_USB_RX_BUFFER_SIZE, esd_usb_read_bulk_callback, dev); retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval == -ENODEV) { for (i = 0; i < dev->net_count; i++) { if (dev->nets[i]) netif_device_detach(dev->nets[i]->netdev); } } else if (retval) { dev_err(dev->udev->dev.parent, "failed resubmitting read bulk urb: %d\n", retval); } } /* callback for bulk IN urb */ static void esd_usb_write_bulk_callback(struct urb *urb) { struct esd_tx_urb_context *context = urb->context; struct esd_usb_net_priv *priv; struct net_device *netdev; size_t size = sizeof(union esd_usb_msg); WARN_ON(!context); priv = context->priv; netdev = priv->netdev; /* free up our allocated buffer */ usb_free_coherent(urb->dev, size, urb->transfer_buffer, urb->transfer_dma); if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "Tx URB aborted (%d)\n", urb->status); netif_trans_update(netdev); } static ssize_t firmware_show(struct device *d, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(d); struct esd_usb *dev = usb_get_intfdata(intf); return sprintf(buf, "%d.%d.%d\n", (dev->version >> 12) & 0xf, (dev->version >> 8) & 0xf, dev->version & 0xff); } static DEVICE_ATTR_RO(firmware); static ssize_t hardware_show(struct device *d, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(d); struct esd_usb *dev = usb_get_intfdata(intf); return sprintf(buf, "%d.%d.%d\n", (dev->version >> 28) & 0xf, (dev->version >> 24) & 0xf, (dev->version >> 16) & 0xff); } static DEVICE_ATTR_RO(hardware); static ssize_t nets_show(struct device *d, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(d); struct esd_usb *dev = usb_get_intfdata(intf); return sprintf(buf, "%d", dev->net_count); } static DEVICE_ATTR_RO(nets); static int esd_usb_send_msg(struct esd_usb *dev, union esd_usb_msg *msg) { int actual_length; return usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, 2), msg, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ &actual_length, 1000); } static int esd_usb_wait_msg(struct esd_usb *dev, union esd_usb_msg *msg) { int actual_length; return usb_bulk_msg(dev->udev, usb_rcvbulkpipe(dev->udev, 1), msg, sizeof(*msg), &actual_length, 1000); } static int esd_usb_setup_rx_urbs(struct esd_usb *dev) { int i, err = 0; if (dev->rxinitdone) return 0; for (i = 0; i < ESD_USB_MAX_RX_URBS; i++) { struct urb *urb = NULL; u8 *buf = NULL; dma_addr_t buf_dma; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { err = -ENOMEM; break; } buf = usb_alloc_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE, GFP_KERNEL, &buf_dma); if (!buf) { dev_warn(dev->udev->dev.parent, "No memory left for USB buffer\n"); err = -ENOMEM; goto freeurb; } urb->transfer_dma = buf_dma; usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), buf, ESD_USB_RX_BUFFER_SIZE, esd_usb_read_bulk_callback, dev); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &dev->rx_submitted); err = usb_submit_urb(urb, GFP_KERNEL); if (err) { usb_unanchor_urb(urb); usb_free_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE, buf, urb->transfer_dma); goto freeurb; } dev->rxbuf[i] = buf; dev->rxbuf_dma[i] = buf_dma; freeurb: /* Drop reference, USB core will take care of freeing it */ usb_free_urb(urb); if (err) break; } /* Did we submit any URBs */ if (i == 0) { dev_err(dev->udev->dev.parent, "couldn't setup read URBs\n"); return err; } /* Warn if we've couldn't transmit all the URBs */ if (i < ESD_USB_MAX_RX_URBS) { dev_warn(dev->udev->dev.parent, "rx performance may be slow\n"); } dev->rxinitdone = 1; return 0; } /* Start interface */ static int esd_usb_start(struct esd_usb_net_priv *priv) { struct esd_usb *dev = priv->usb; struct net_device *netdev = priv->netdev; union esd_usb_msg *msg; int err, i; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } /* Enable all IDs * The IDADD message takes up to 64 32 bit bitmasks (2048 bits). * Each bit represents one 11 bit CAN identifier. A set bit * enables reception of the corresponding CAN identifier. A cleared * bit disabled this identifier. An additional bitmask value * following the CAN 2.0A bits is used to enable reception of * extended CAN frames. Only the LSB of this final mask is checked * for the complete 29 bit ID range. The IDADD message also allows * filter configuration for an ID subset. In this case you can add * the number of the starting bitmask (0..64) to the filter.option * field followed by only some bitmasks. */ msg->hdr.cmd = ESD_USB_CMD_IDADD; msg->hdr.len = sizeof(struct esd_usb_id_filter_msg) / sizeof(u32); /* # of 32bit words */ msg->filter.net = priv->index; msg->filter.option = ESD_USB_ID_ENABLE; /* start with segment 0 */ for (i = 0; i < ESD_USB_MAX_ID_SEGMENT; i++) msg->filter.mask[i] = cpu_to_le32(GENMASK(31, 0)); /* enable 29bit extended IDs */ msg->filter.mask[ESD_USB_MAX_ID_SEGMENT] = cpu_to_le32(BIT(0)); err = esd_usb_send_msg(dev, msg); if (err) goto out; err = esd_usb_setup_rx_urbs(dev); if (err) goto out; priv->can.state = CAN_STATE_ERROR_ACTIVE; out: if (err == -ENODEV) netif_device_detach(netdev); if (err) netdev_err(netdev, "couldn't start device: %d\n", err); kfree(msg); return err; } static void unlink_all_urbs(struct esd_usb *dev) { struct esd_usb_net_priv *priv; int i, j; usb_kill_anchored_urbs(&dev->rx_submitted); for (i = 0; i < ESD_USB_MAX_RX_URBS; ++i) usb_free_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE, dev->rxbuf[i], dev->rxbuf_dma[i]); for (i = 0; i < dev->net_count; i++) { priv = dev->nets[i]; if (priv) { usb_kill_anchored_urbs(&priv->tx_submitted); atomic_set(&priv->active_tx_jobs, 0); for (j = 0; j < ESD_USB_MAX_TX_URBS; j++) priv->tx_contexts[j].echo_index = ESD_USB_MAX_TX_URBS; } } } static int esd_usb_open(struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); int err; /* common open */ err = open_candev(netdev); if (err) return err; /* finally start device */ err = esd_usb_start(priv); if (err) { netdev_warn(netdev, "couldn't start device: %d\n", err); close_candev(netdev); return err; } netif_start_queue(netdev); return 0; } static netdev_tx_t esd_usb_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); struct esd_usb *dev = priv->usb; struct esd_tx_urb_context *context = NULL; struct net_device_stats *stats = &netdev->stats; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; union esd_usb_msg *msg; struct urb *urb; u8 *buf; int i, err; int ret = NETDEV_TX_OK; size_t size = sizeof(union esd_usb_msg); if (can_dev_dropped_skb(netdev, skb)) return NETDEV_TX_OK; /* create a URB, and a buffer for it, and copy the data to the URB */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) { stats->tx_dropped++; dev_kfree_skb(skb); goto nourbmem; } buf = usb_alloc_coherent(dev->udev, size, GFP_ATOMIC, &urb->transfer_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); stats->tx_dropped++; dev_kfree_skb(skb); goto nobufmem; } msg = (union esd_usb_msg *)buf; /* minimal length as # of 32bit words */ msg->hdr.len = offsetof(struct esd_usb_tx_msg, data) / sizeof(u32); msg->hdr.cmd = ESD_USB_CMD_CAN_TX; msg->tx.net = priv->index; if (can_is_canfd_skb(skb)) { msg->tx.dlc = can_fd_len2dlc(cfd->len); msg->tx.dlc |= ESD_USB_FD; if ((cfd->flags & CANFD_BRS) == 0) msg->tx.dlc |= ESD_USB_NO_BRS; } else { msg->tx.dlc = can_get_cc_dlc((struct can_frame *)cfd, priv->can.ctrlmode); if (cfd->can_id & CAN_RTR_FLAG) msg->tx.dlc |= ESD_USB_RTR; } msg->tx.id = cpu_to_le32(cfd->can_id & CAN_ERR_MASK); if (cfd->can_id & CAN_EFF_FLAG) msg->tx.id |= cpu_to_le32(ESD_USB_EXTID); memcpy(msg->tx.data_fd, cfd->data, cfd->len); /* round up, then divide by 4 to add the payload length as # of 32bit words */ msg->hdr.len += DIV_ROUND_UP(cfd->len, sizeof(u32)); for (i = 0; i < ESD_USB_MAX_TX_URBS; i++) { if (priv->tx_contexts[i].echo_index == ESD_USB_MAX_TX_URBS) { context = &priv->tx_contexts[i]; break; } } /* This may never happen */ if (!context) { netdev_warn(netdev, "couldn't find free context\n"); ret = NETDEV_TX_BUSY; goto releasebuf; } context->priv = priv; context->echo_index = i; /* hnd must not be 0 - MSB is stripped in txdone handling */ msg->tx.hnd = BIT(31) | i; /* returned in TX done message */ usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), buf, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ esd_usb_write_bulk_callback, context); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_jobs); /* Slow down tx path */ if (atomic_read(&priv->active_tx_jobs) >= ESD_USB_MAX_TX_URBS) netif_stop_queue(netdev); err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { can_free_echo_skb(netdev, context->echo_index, NULL); atomic_dec(&priv->active_tx_jobs); usb_unanchor_urb(urb); stats->tx_dropped++; if (err == -ENODEV) netif_device_detach(netdev); else netdev_warn(netdev, "failed tx_urb %d\n", err); goto releasebuf; } netif_trans_update(netdev); /* Release our reference to this URB, the USB core will eventually free * it entirely. */ usb_free_urb(urb); return NETDEV_TX_OK; releasebuf: usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); nobufmem: usb_free_urb(urb); nourbmem: return ret; } static int esd_usb_close(struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); union esd_usb_msg *msg; int i; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; /* Disable all IDs (see esd_usb_start()) */ msg->hdr.cmd = ESD_USB_CMD_IDADD; msg->hdr.len = sizeof(struct esd_usb_id_filter_msg) / sizeof(u32);/* # of 32bit words */ msg->filter.net = priv->index; msg->filter.option = ESD_USB_ID_ENABLE; /* start with segment 0 */ for (i = 0; i <= ESD_USB_MAX_ID_SEGMENT; i++) msg->filter.mask[i] = 0; if (esd_usb_send_msg(priv->usb, msg) < 0) netdev_err(netdev, "sending idadd message failed\n"); /* set CAN controller to reset mode */ msg->hdr.len = sizeof(struct esd_usb_set_baudrate_msg) / sizeof(u32); /* # of 32bit words */ msg->hdr.cmd = ESD_USB_CMD_SETBAUD; msg->setbaud.net = priv->index; msg->setbaud.rsvd = 0; msg->setbaud.baud = cpu_to_le32(ESD_USB_NO_BAUDRATE); if (esd_usb_send_msg(priv->usb, msg) < 0) netdev_err(netdev, "sending setbaud message failed\n"); priv->can.state = CAN_STATE_STOPPED; netif_stop_queue(netdev); close_candev(netdev); kfree(msg); return 0; } static const struct net_device_ops esd_usb_netdev_ops = { .ndo_open = esd_usb_open, .ndo_stop = esd_usb_close, .ndo_start_xmit = esd_usb_start_xmit, .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops esd_usb_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static const struct can_bittiming_const esd_usb_2_bittiming_const = { .name = "esd_usb_2", .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 1024, .brp_inc = 1, }; static int esd_usb_2_set_bittiming(struct net_device *netdev) { const struct can_bittiming_const *btc = &esd_usb_2_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *bt = &priv->can.bittiming; union esd_usb_msg *msg; int err; u32 canbtr; int sjw_shift; canbtr = ESD_USB_UBR; if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) canbtr |= ESD_USB_LOM; canbtr |= (bt->brp - 1) & (btc->brp_max - 1); if (le16_to_cpu(priv->usb->udev->descriptor.idProduct) == ESD_USB_CANUSBM_PRODUCT_ID) sjw_shift = ESD_USB_M_SJW_SHIFT; else sjw_shift = ESD_USB_2_SJW_SHIFT; canbtr |= ((bt->sjw - 1) & (btc->sjw_max - 1)) << sjw_shift; canbtr |= ((bt->prop_seg + bt->phase_seg1 - 1) & (btc->tseg1_max - 1)) << ESD_USB_2_TSEG1_SHIFT; canbtr |= ((bt->phase_seg2 - 1) & (btc->tseg2_max - 1)) << ESD_USB_2_TSEG2_SHIFT; if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) canbtr |= ESD_USB_TRIPLE_SAMPLES; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; msg->hdr.len = sizeof(struct esd_usb_set_baudrate_msg) / sizeof(u32); /* # of 32bit words */ msg->hdr.cmd = ESD_USB_CMD_SETBAUD; msg->setbaud.net = priv->index; msg->setbaud.rsvd = 0; msg->setbaud.baud = cpu_to_le32(canbtr); netdev_dbg(netdev, "setting BTR=%#x\n", canbtr); err = esd_usb_send_msg(priv->usb, msg); kfree(msg); return err; } /* Nominal bittiming constants, see * Microchip SAM E70/S70/V70/V71, Data Sheet, Rev. G - 07/2022 * 48.6.8 MCAN Nominal Bit Timing and Prescaler Register */ static const struct can_bittiming_const esd_usb_3_nom_bittiming_const = { .name = "esd_usb_3", .tseg1_min = 2, .tseg1_max = 256, .tseg2_min = 2, .tseg2_max = 128, .sjw_max = 128, .brp_min = 1, .brp_max = 512, .brp_inc = 1, }; /* Data bittiming constants, see * Microchip SAM E70/S70/V70/V71, Data Sheet, Rev. G - 07/2022 * 48.6.4 MCAN Data Bit Timing and Prescaler Register */ static const struct can_bittiming_const esd_usb_3_data_bittiming_const = { .name = "esd_usb_3", .tseg1_min = 2, .tseg1_max = 32, .tseg2_min = 1, .tseg2_max = 16, .sjw_max = 8, .brp_min = 1, .brp_max = 32, .brp_inc = 1, }; static int esd_usb_3_set_bittiming(struct net_device *netdev) { const struct can_bittiming_const *nom_btc = &esd_usb_3_nom_bittiming_const; const struct can_bittiming_const *data_btc = &esd_usb_3_data_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *nom_bt = &priv->can.bittiming; struct can_bittiming *data_bt = &priv->can.data_bittiming; struct esd_usb_3_set_baudrate_msg_x *baud_x; union esd_usb_msg *msg; u16 flags = 0; int err; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; baud_x = &msg->setbaud_x; /* Canonical is the most reasonable mode for SocketCAN on CAN-USB/3 ... */ baud_x->mode = cpu_to_le16(ESD_USB_3_BAUDRATE_MODE_BTR_CANONICAL); if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) flags |= ESD_USB_3_BAUDRATE_FLAG_LOM; baud_x->nom.brp = cpu_to_le16(nom_bt->brp & (nom_btc->brp_max - 1)); baud_x->nom.sjw = cpu_to_le16(nom_bt->sjw & (nom_btc->sjw_max - 1)); baud_x->nom.tseg1 = cpu_to_le16((nom_bt->prop_seg + nom_bt->phase_seg1) & (nom_btc->tseg1_max - 1)); baud_x->nom.tseg2 = cpu_to_le16(nom_bt->phase_seg2 & (nom_btc->tseg2_max - 1)); if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { baud_x->data.brp = cpu_to_le16(data_bt->brp & (data_btc->brp_max - 1)); baud_x->data.sjw = cpu_to_le16(data_bt->sjw & (data_btc->sjw_max - 1)); baud_x->data.tseg1 = cpu_to_le16((data_bt->prop_seg + data_bt->phase_seg1) & (data_btc->tseg1_max - 1)); baud_x->data.tseg2 = cpu_to_le16(data_bt->phase_seg2 & (data_btc->tseg2_max - 1)); flags |= ESD_USB_3_BAUDRATE_FLAG_FD; } /* Currently this driver only supports the automatic TDC mode */ baud_x->tdc.tdc_mode = ESD_USB_3_TDC_MODE_AUTO; baud_x->tdc.ssp_offset = 0; baud_x->tdc.ssp_shift = 0; baud_x->tdc.tdc_filter = 0; baud_x->flags = cpu_to_le16(flags); baud_x->net = priv->index; baud_x->rsvd = 0; /* set len as # of 32bit words */ msg->hdr.len = sizeof(struct esd_usb_3_set_baudrate_msg_x) / sizeof(u32); msg->hdr.cmd = ESD_USB_CMD_SETBAUD; netdev_dbg(netdev, "ctrlmode=%#x/%#x, esd-net=%u, esd-mode=%#x, esd-flags=%#x\n", priv->can.ctrlmode, priv->can.ctrlmode_supported, priv->index, le16_to_cpu(baud_x->mode), flags); err = esd_usb_send_msg(priv->usb, msg); kfree(msg); return err; } static int esd_usb_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct esd_usb_net_priv *priv = netdev_priv(netdev); bec->txerr = priv->bec.txerr; bec->rxerr = priv->bec.rxerr; return 0; } static int esd_usb_set_mode(struct net_device *netdev, enum can_mode mode) { switch (mode) { case CAN_MODE_START: netif_wake_queue(netdev); break; default: return -EOPNOTSUPP; } return 0; } static int esd_usb_probe_one_net(struct usb_interface *intf, int index) { struct esd_usb *dev = usb_get_intfdata(intf); struct net_device *netdev; struct esd_usb_net_priv *priv; int err = 0; int i; netdev = alloc_candev(sizeof(*priv), ESD_USB_MAX_TX_URBS); if (!netdev) { dev_err(&intf->dev, "couldn't alloc candev\n"); err = -ENOMEM; goto done; } priv = netdev_priv(netdev); init_usb_anchor(&priv->tx_submitted); atomic_set(&priv->active_tx_jobs, 0); for (i = 0; i < ESD_USB_MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = ESD_USB_MAX_TX_URBS; priv->usb = dev; priv->netdev = netdev; priv->index = index; priv->can.state = CAN_STATE_STOPPED; priv->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_CC_LEN8_DLC | CAN_CTRLMODE_BERR_REPORTING; switch (le16_to_cpu(dev->udev->descriptor.idProduct)) { case ESD_USB_CANUSB3_PRODUCT_ID: priv->can.clock.freq = ESD_USB_3_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD; priv->can.bittiming_const = &esd_usb_3_nom_bittiming_const; priv->can.data_bittiming_const = &esd_usb_3_data_bittiming_const; priv->can.do_set_bittiming = esd_usb_3_set_bittiming; priv->can.do_set_data_bittiming = esd_usb_3_set_bittiming; break; case ESD_USB_CANUSBM_PRODUCT_ID: priv->can.clock.freq = ESD_USB_M_CAN_CLOCK; priv->can.bittiming_const = &esd_usb_2_bittiming_const; priv->can.do_set_bittiming = esd_usb_2_set_bittiming; break; case ESD_USB_CANUSB2_PRODUCT_ID: default: priv->can.clock.freq = ESD_USB_2_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_3_SAMPLES; priv->can.bittiming_const = &esd_usb_2_bittiming_const; priv->can.do_set_bittiming = esd_usb_2_set_bittiming; break; } priv->can.do_set_mode = esd_usb_set_mode; priv->can.do_get_berr_counter = esd_usb_get_berr_counter; netdev->flags |= IFF_ECHO; /* we support local echo */ netdev->netdev_ops = &esd_usb_netdev_ops; netdev->ethtool_ops = &esd_usb_ethtool_ops; SET_NETDEV_DEV(netdev, &intf->dev); netdev->dev_id = index; err = register_candev(netdev); if (err) { dev_err(&intf->dev, "couldn't register CAN device: %d\n", err); free_candev(netdev); err = -ENOMEM; goto done; } dev->nets[index] = priv; netdev_info(netdev, "device %s registered\n", netdev->name); done: return err; } /* probe function for new USB devices * * check version information and number of available * CAN interfaces */ static int esd_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct esd_usb *dev; union esd_usb_msg *msg; int i, err; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto done; } dev->udev = interface_to_usbdev(intf); init_usb_anchor(&dev->rx_submitted); usb_set_intfdata(intf, dev); msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto free_msg; } /* query number of CAN interfaces (nets) */ msg->hdr.cmd = ESD_USB_CMD_VERSION; msg->hdr.len = sizeof(struct esd_usb_version_msg) / sizeof(u32); /* # of 32bit words */ msg->version.rsvd = 0; msg->version.flags = 0; msg->version.drv_version = 0; err = esd_usb_send_msg(dev, msg); if (err < 0) { dev_err(&intf->dev, "sending version message failed\n"); goto free_msg; } err = esd_usb_wait_msg(dev, msg); if (err < 0) { dev_err(&intf->dev, "no version message answer\n"); goto free_msg; } dev->net_count = (int)msg->version_reply.nets; dev->version = le32_to_cpu(msg->version_reply.version); if (device_create_file(&intf->dev, &dev_attr_firmware)) dev_err(&intf->dev, "Couldn't create device file for firmware\n"); if (device_create_file(&intf->dev, &dev_attr_hardware)) dev_err(&intf->dev, "Couldn't create device file for hardware\n"); if (device_create_file(&intf->dev, &dev_attr_nets)) dev_err(&intf->dev, "Couldn't create device file for nets\n"); /* do per device probing */ for (i = 0; i < dev->net_count; i++) esd_usb_probe_one_net(intf, i); free_msg: kfree(msg); if (err) kfree(dev); done: return err; } /* called by the usb core when the device is removed from the system */ static void esd_usb_disconnect(struct usb_interface *intf) { struct esd_usb *dev = usb_get_intfdata(intf); struct net_device *netdev; int i; device_remove_file(&intf->dev, &dev_attr_firmware); device_remove_file(&intf->dev, &dev_attr_hardware); device_remove_file(&intf->dev, &dev_attr_nets); usb_set_intfdata(intf, NULL); if (dev) { for (i = 0; i < dev->net_count; i++) { if (dev->nets[i]) { netdev = dev->nets[i]->netdev; unregister_netdev(netdev); free_candev(netdev); } } unlink_all_urbs(dev); kfree(dev); } } /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver esd_usb_driver = { .name = KBUILD_MODNAME, .probe = esd_usb_probe, .disconnect = esd_usb_disconnect, .id_table = esd_usb_table, }; module_usb_driver(esd_usb_driver);
307 307 307 307 307 307 307 307 307 306 307 307 307 305 7 7 305 305 305 303 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 5 304 304 305 299 305 132 305 305 304 305 304 305 305 304 304 305 304 305 1 305 305 307 307 307 2 307 305 305 305 305 304 305 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 1 6 7 7 298 305 305 305 305 298 298 298 305 305 304 305 306 307 307 2 2 2 305 305 305 305 304 7 7 1 7 298 305 305 2 307 307 307 307 2 304 305 304 305 304 305 5 305 305 304 307 305 305 305 13 305 304 304 305 305 304 305 304 305 305 305 305 305 305 305 305 217 305 88 88 5 5 5 5 5 6 7 7 7 7 7 7 7 7 6 6 7 7 7 6 7 7 7 7 7 5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 307 307 307 305 7 7 7 7 7 7 7 7 7 7 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_trans_priv.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_ag.h" #include "xfs_quota.h" #include "xfs_reflink.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) STATIC int xlog_find_zeroed( struct xlog *, xfs_daddr_t *); STATIC int xlog_clear_stale_blocks( struct xlog *, xfs_lsn_t); STATIC int xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* * Sector aligned buffer routines for buffer create/read/write/access */ /* * Verify the log-relative block number and length in basic blocks are valid for * an operation involving the given XFS log buffer. Returns true if the fields * are valid, false otherwise. */ static inline bool xlog_verify_bno( struct xlog *log, xfs_daddr_t blk_no, int bbcount) { if (blk_no < 0 || blk_no >= log->l_logBBsize) return false; if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize) return false; return true; } /* * Allocate a buffer to hold log data. The buffer needs to be able to map to * a range of nbblks basic blocks at any valid offset within the log. */ static char * xlog_alloc_buffer( struct xlog *log, int nbblks) { /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. */ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); return NULL; } /* * We do log I/O in units of log sectors (a power-of-2 multiple of the * basic block size), so we round up the requested size to accommodate * the basic blocks required for complete log sectors. * * In addition, the buffer may be used for a non-sector-aligned block * offset, in which case an I/O of the requested size could extend * beyond the end of the buffer. If the requested size is only 1 basic * block it will never straddle a sector boundary, so this won't be an * issue. Nor will this be a problem if the log I/O is done in basic * blocks (sector size 1). But otherwise we extend the buffer by one * extra log sector to ensure there's space to accommodate this * possibility. */ if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL); } /* * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. */ static inline unsigned int xlog_align( struct xlog *log, xfs_daddr_t blk_no) { return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1)); } static int xlog_do_io( struct xlog *log, xfs_daddr_t blk_no, unsigned int nbblks, char *data, enum req_op op) { int error; if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) { xfs_warn(log->l_mp, "Invalid log block/length (0x%llx, 0x%x) for buffer", blk_no, nbblks); return -EFSCORRUPTED; } blk_no = round_down(blk_no, log->l_sectBBsize); nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no, BBTOB(nbblks), data, op); if (error && !xlog_is_shutdown(log)) { xfs_alert(log->l_mp, "log recovery %s I/O error at daddr 0x%llx len %d error %d", op == REQ_OP_WRITE ? "write" : "read", blk_no, nbblks, error); } return error; } STATIC int xlog_bread_noalign( struct xlog *log, xfs_daddr_t blk_no, int nbblks, char *data) { return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ); } STATIC int xlog_bread( struct xlog *log, xfs_daddr_t blk_no, int nbblks, char *data, char **offset) { int error; error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ); if (!error) *offset = data + xlog_align(log, blk_no); return error; } STATIC int xlog_bwrite( struct xlog *log, xfs_daddr_t blk_no, int nbblks, char *data) { return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE); } #ifdef DEBUG /* * dump debug superblock and log record information */ STATIC void xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); xfs_debug(mp, " log : uuid = %pU, fmt = %d", &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else #define xlog_header_check_dump(mp, head) #endif /* * check log record header for recovery */ STATIC int xlog_header_check_recover( xfs_mount_t *mp, xlog_rec_header_t *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); /* * IRIX doesn't write the h_fmt field and leaves it zeroed * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. */ if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) { xfs_warn(mp, "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); return -EFSCORRUPTED; } if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { xfs_warn(mp, "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); return -EFSCORRUPTED; } return 0; } /* * read the head block of the log and check the header */ STATIC int xlog_header_check_mount( xfs_mount_t *mp, xlog_rec_header_t *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); if (uuid_is_null(&head->h_fs_uuid)) { /* * IRIX doesn't write the h_fs_uuid or h_fmt fields. If * h_fs_uuid is null, we assume this log was last mounted * by IRIX and continue. */ xfs_warn(mp, "null uuid in log - IRIX style log"); } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); return -EFSCORRUPTED; } return 0; } /* * This routine finds (to an approximation) the first block in the physical * log which contains the given cycle. It uses a binary search algorithm. * Note that the algorithm can not be perfect because the disk will not * necessarily be perfect. */ STATIC int xlog_find_cycle_start( struct xlog *log, char *buffer, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) { char *offset; xfs_daddr_t mid_blk; xfs_daddr_t end_blk; uint mid_cycle; int error; end_blk = *last_blk; mid_blk = BLK_AVG(first_blk, end_blk); while (mid_blk != first_blk && mid_blk != end_blk) { error = xlog_bread(log, mid_blk, 1, buffer, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); if (mid_cycle == cycle) end_blk = mid_blk; /* last_half_cycle == mid_cycle */ else first_blk = mid_blk; /* first_half_cycle == mid_cycle */ mid_blk = BLK_AVG(first_blk, end_blk); } ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || (mid_blk == end_blk && mid_blk-1 == first_blk)); *last_blk = end_blk; return 0; } /* * Check that a range of blocks does not contain stop_on_cycle_no. * Fill in *new_blk with the block offset where such a block is * found, or with -1 (an invalid block number) if there is no such * block in the range. The scan needs to occur from front to back * and the pointer into the region must be updated since a later * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, xfs_daddr_t *new_blk) { xfs_daddr_t i, j; uint cycle; char *buffer; xfs_daddr_t bufblks; char *buf = NULL; int error = 0; /* * Greedily allocate a buffer big enough to handle the full * range of basic blocks we'll be examining. If that fails, * try a smaller size. We need to be able to read at least * a log sector, or we're out of luck. */ bufblks = roundup_pow_of_two(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) return -ENOMEM; } for (i = start_blk; i < start_blk + nbblks; i += bufblks) { int bcount; bcount = min(bufblks, (start_blk + nbblks - i)); error = xlog_bread(log, i, bcount, buffer, &buf); if (error) goto out; for (j = 0; j < bcount; j++) { cycle = xlog_get_cycle(buf); if (cycle == stop_on_cycle_no) { *new_blk = i+j; goto out; } buf += BBSIZE; } } *new_blk = -1; out: kvfree(buffer); return error; } static inline int xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh) { if (xfs_has_logv2(log->l_mp)) { int h_size = be32_to_cpu(rh->h_size); if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) && h_size > XLOG_HEADER_CYCLE_SIZE) return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); } return 1; } /* * Potentially backup over partial log record write. * * In the typical case, last_blk is the number of the block directly after * a good log record. Therefore, we subtract one to get the block number * of the last block in the given buffer. extra_bblks contains the number * of blocks we would have read on a previous read. This happens when the * last log record is split over the end of the physical log. * * extra_bblks is the number of blocks potentially verified on a previous * call to this routine. */ STATIC int xlog_find_verify_log_record( struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) { xfs_daddr_t i; char *buffer; char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; int smallmem = 0; int num_blks = *last_blk - start_blk; int xhdrs; ASSERT(start_blk != 0 || *last_blk != start_blk); buffer = xlog_alloc_buffer(log, num_blks); if (!buffer) { buffer = xlog_alloc_buffer(log, 1); if (!buffer) return -ENOMEM; smallmem = 1; } else { error = xlog_bread(log, start_blk, num_blks, buffer, &offset); if (error) goto out; offset += ((num_blks - 1) << BBSHIFT); } for (i = (*last_blk) - 1; i >= 0; i--) { if (i < start_blk) { /* valid log record not found */ xfs_warn(log->l_mp, "Log inconsistent (didn't find previous header)"); ASSERT(0); error = -EFSCORRUPTED; goto out; } if (smallmem) { error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out; } head = (xlog_rec_header_t *)offset; if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; if (!smallmem) offset -= BBSIZE; } /* * We hit the beginning of the physical log & still no header. Return * to caller. If caller can handle a return of -1, then this routine * will be called again for the end of the physical log. */ if (i == -1) { error = 1; goto out; } /* * We have the final block of the good log (the first block * of the log record _before_ the head. So we check the uuid. */ if ((error = xlog_header_check_mount(log->l_mp, head))) goto out; /* * We may have found a log record header before we expected one. * last_blk will be the 1st block # with a given cycle #. We may end * up reading an entire log record. In this case, we don't want to * reset last_blk. Only when last_blk points in the middle of a log * record do we update last_blk. */ xhdrs = xlog_logrec_hblks(log, head); if (*last_blk - i + extra_bblks != BTOBB(be32_to_cpu(head->h_len)) + xhdrs) *last_blk = i; out: kvfree(buffer); return error; } /* * Head is defined to be the point of the log where the next log write * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing * from our current block number. * * last_blk contains the block number of the first block with a given * cycle number. * * Return: zero if normal, non-zero if error. */ STATIC int xlog_find_head( struct xlog *log, xfs_daddr_t *return_head_blk) { char *buffer; char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; uint first_half_cycle, last_half_cycle; uint stop_on_cycle; int error, log_bbnum = log->l_logBBsize; /* Is the end of the log device zeroed? */ error = xlog_find_zeroed(log, &first_blk); if (error < 0) { xfs_warn(log->l_mp, "empty log check failed"); return error; } if (error == 1) { *return_head_blk = first_blk; /* Is the whole lot zeroed? */ if (!first_blk) { /* Linux XFS shouldn't generate totally zeroed logs - * mkfs etc write a dummy unmount record to a fresh * log so we can store the uuid in there */ xfs_warn(log->l_mp, "totally zeroed log"); } return 0; } first_blk = 0; /* get cycle # of 1st block */ buffer = xlog_alloc_buffer(log, 1); if (!buffer) return -ENOMEM; error = xlog_bread(log, 0, 1, buffer, &offset); if (error) goto out_free_buffer; first_half_cycle = xlog_get_cycle(offset); last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ error = xlog_bread(log, last_blk, 1, buffer, &offset); if (error) goto out_free_buffer; last_half_cycle = xlog_get_cycle(offset); ASSERT(last_half_cycle != 0); /* * If the 1st half cycle number is equal to the last half cycle number, * then the entire log is stamped with the same cycle number. In this * case, head_blk can't be set to zero (which makes sense). The below * math doesn't work out properly with head_blk equal to zero. Instead, * we set it to log_bbnum which is an invalid block number, but this * value makes the math correct. If head_blk doesn't changed through * all the tests below, *head_blk is set to zero at the very end rather * than log_bbnum. In a sense, log_bbnum and zero are the same block * in a circular file. */ if (first_half_cycle == last_half_cycle) { /* * In this case we believe that the entire log should have * cycle number last_half_cycle. We need to scan backwards * from the end verifying that there are no holes still * containing last_half_cycle - 1. If we find such a hole, * then the start of that hole will be the new head. The * simple case looks like * x | x ... | x - 1 | x * Another case that fits this picture would be * x | x + 1 | x ... | x * In this case the head really is somewhere at the end of the * log, as one of the latest writes at the beginning was * incomplete. * One more case is * x | x + 1 | x ... | x - 1 | x * This is really the combination of the above two cases, and * the head has to end up at the start of the x-1 hole at the * end of the log. * * In the 256k log case, we will read from the beginning to the * end of the log and search for cycle numbers equal to x-1. * We don't worry about the x+1 blocks that we encounter, * because we know that they cannot be the head since the log * started with x. */ head_blk = log_bbnum; stop_on_cycle = last_half_cycle - 1; } else { /* * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary * search may not be totally accurate, so then we scan back * from there looking for occurrences of last_half_cycle before * us. If that backwards scan wraps around the beginning of * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like * v binary search stopped here * x + 1 ... | x | x + 1 | x ... | x * ^ but we want to locate this spot * or * <---------> less than scan distance * x + 1 ... | x ... | x - 1 | x * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk, last_half_cycle); if (error) goto out_free_buffer; } /* * Now validate the answer. Scan back some number of maximum possible * blocks and make sure each one has the expected cycle number. The * maximum is determined by the total possible amount of buffering * in the in-core log. The following number can be made tighter if * we actually look at the block size of the filesystem. */ num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log)); if (head_blk >= num_scan_bblks) { /* * We are guaranteed that the entire check can be performed * in one buffer. */ start_blk = head_blk - num_scan_bblks; if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks, stop_on_cycle, &new_blk))) goto out_free_buffer; if (new_blk != -1) head_blk = new_blk; } else { /* need to read 2 parts of log */ /* * We are going to scan backwards in the log in two parts. * First we scan the physical end of the log. In this part * of the log, we are looking for blocks with cycle number * last_half_cycle - 1. * If we find one, then we know that the log starts there, as * we've found a hole that didn't get written in going around * the end of the physical log. The simple case for this is * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance * If all of the blocks at the end of the log have cycle number * last_half_cycle, then we check the blocks at the start of * the log looking for occurrences of last_half_cycle. If we * find one, then our current estimate for the location of the * first occurrence of last_half_cycle is wrong and we move * back to the hole we've found. This case looks like * x + 1 ... | x | x + 1 | x ... * ^ binary search stopped here * Another case we need to handle that only occurs in 256k * logs is * x + 1 ... | x ... | x+1 | x ... * ^ binary search stops here * In a 256k log, the scan at the end of the log will see the * x + 1 blocks. We need to skip past those since that is * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. */ ASSERT(head_blk <= INT_MAX && (xfs_daddr_t) num_scan_bblks >= head_blk); start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto out_free_buffer; if (new_blk != -1) { head_blk = new_blk; goto validate_head; } /* * Scan beginning of log now. The last part of the physical * log is good. This scan needs to verify that it doesn't find * the last_half_cycle. */ start_blk = 0; ASSERT(head_blk <= INT_MAX); if ((error = xlog_find_verify_cycle(log, start_blk, (int)head_blk, stop_on_cycle, &new_blk))) goto out_free_buffer; if (new_blk != -1) head_blk = new_blk; } validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. */ num_scan_bblks = XLOG_REC_SHIFT(log); if (head_blk >= num_scan_bblks) { start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ /* start ptr at last block ptr before head_blk */ error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); if (error == 1) error = -EIO; if (error) goto out_free_buffer; } else { start_blk = 0; ASSERT(head_blk <= INT_MAX); error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); if (error < 0) goto out_free_buffer; if (error == 1) { /* We hit the beginning of the log during our search */ start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); ASSERT(head_blk <= INT_MAX); error = xlog_find_verify_log_record(log, start_blk, &new_blk, (int)head_blk); if (error == 1) error = -EIO; if (error) goto out_free_buffer; if (new_blk != log_bbnum) head_blk = new_blk; } else if (error) goto out_free_buffer; } kvfree(buffer); if (head_blk == log_bbnum) *return_head_blk = 0; else *return_head_blk = head_blk; /* * When returning here, we have a good block number. Bad block * means that during a previous crash, we didn't have a clean break * from cycle number N to cycle number N-1. In this case, we need * to find the first block with cycle number N-1. */ return 0; out_free_buffer: kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to find log head"); return error; } /* * Seek backwards in the log for log record headers. * * Given a starting log block, walk backwards until we find the provided number * of records or hit the provided tail block. The return value is the number of * records encountered or a negative error code. The log block and buffer * pointer of the last record seen are returned in rblk and rhead respectively. */ STATIC int xlog_rseek_logrec_hdr( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int count, char *buffer, xfs_daddr_t *rblk, struct xlog_rec_header **rhead, bool *wrapped) { int i; int error; int found = 0; char *offset = NULL; xfs_daddr_t end_blk; *wrapped = false; /* * Walk backwards from the head block until we hit the tail or the first * block in the log. */ end_blk = head_blk > tail_blk ? tail_blk : 0; for (i = (int) head_blk - 1; i >= end_blk; i--) { error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { *rblk = i; *rhead = (struct xlog_rec_header *) offset; if (++found == count) break; } } /* * If we haven't hit the tail block or the log record header count, * start looking again from the end of the physical log. Note that * callers can pass head == tail if the tail is not yet known. */ if (tail_blk >= head_blk && found != count) { for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { *wrapped = true; *rblk = i; *rhead = (struct xlog_rec_header *) offset; if (++found == count) break; } } } return found; out_error: return error; } /* * Seek forward in the log for log record headers. * * Given head and tail blocks, walk forward from the tail block until we find * the provided number of records or hit the head block. The return value is the * number of records encountered or a negative error code. The log block and * buffer pointer of the last record seen are returned in rblk and rhead * respectively. */ STATIC int xlog_seek_logrec_hdr( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int count, char *buffer, xfs_daddr_t *rblk, struct xlog_rec_header **rhead, bool *wrapped) { int i; int error; int found = 0; char *offset = NULL; xfs_daddr_t end_blk; *wrapped = false; /* * Walk forward from the tail block until we hit the head or the last * block in the log. */ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; for (i = (int) tail_blk; i <= end_blk; i++) { error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { *rblk = i; *rhead = (struct xlog_rec_header *) offset; if (++found == count) break; } } /* * If we haven't hit the head block or the log record header count, * start looking again from the start of the physical log. */ if (tail_blk > head_blk && found != count) { for (i = 0; i < (int) head_blk; i++) { error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { *wrapped = true; *rblk = i; *rhead = (struct xlog_rec_header *) offset; if (++found == count) break; } } } return found; out_error: return error; } /* * Calculate distance from head to tail (i.e., unused space in the log). */ static inline int xlog_tail_distance( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { if (head_blk < tail_blk) return tail_blk - head_blk; return tail_blk + (log->l_logBBsize - head_blk); } /* * Verify the log tail. This is particularly important when torn or incomplete * writes have been detected near the front of the log and the head has been * walked back accordingly. * * We also have to handle the case where the tail was pinned and the head * blocked behind the tail right before a crash. If the tail had been pushed * immediately prior to the crash and the subsequent checkpoint was only * partially written, it's possible it overwrote the last referenced tail in the * log with garbage. This is not a coherency problem because the tail must have * been pushed before it can be overwritten, but appears as log corruption to * recovery because we have no way to know the tail was updated if the * subsequent checkpoint didn't write successfully. * * Therefore, CRC check the log from tail to head. If a failure occurs and the * offending record is within max iclog bufs from the head, walk the tail * forward and retry until a valid tail is found or corruption is detected out * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t *tail_blk, int hsize) { struct xlog_rec_header *thead; char *buffer; xfs_daddr_t first_bad; int error = 0; bool wrapped; xfs_daddr_t tmp_tail; xfs_daddr_t orig_tail = *tail_blk; buffer = xlog_alloc_buffer(log, 1); if (!buffer) return -ENOMEM; /* * Make sure the tail points to a record (returns positive count on * success). */ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer, &tmp_tail, &thead, &wrapped); if (error < 0) goto out; if (*tail_blk != tmp_tail) *tail_blk = tmp_tail; /* * Run a CRC check from the tail to the head. We can't just check * MAX_ICLOGS records past the tail because the tail may point to stale * blocks cleared during the search for the head/tail. These blocks are * overwritten with zero-length records and thus record count is not a * reliable indicator of the iclog state before a crash. */ first_bad = 0; error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { int tail_distance; /* * Is corruption within range of the head? If so, retry from * the next record. Otherwise return an error. */ tail_distance = xlog_tail_distance(log, head_blk, first_bad); if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) break; /* skip to the next record; returns positive count on success */ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, buffer, &tmp_tail, &thead, &wrapped); if (error < 0) goto out; *tail_blk = tmp_tail; first_bad = 0; error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); } if (!error && *tail_blk != orig_tail) xfs_warn(log->l_mp, "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", orig_tail, *tail_blk); out: kvfree(buffer); return error; } /* * Detect and trim torn writes from the head of the log. * * Storage without sector atomicity guarantees can result in torn writes in the * log in the event of a crash. Our only means to detect this scenario is via * CRC verification. While we can't always be certain that CRC verification * failure is due to a torn write vs. an unrelated corruption, we do know that * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of * the log and treat failures in this range as torn writes as a matter of * policy. In the event of CRC failure, the head is walked back to the last good * record in the log and the tail is updated from that record and verified. */ STATIC int xlog_verify_head( struct xlog *log, xfs_daddr_t *head_blk, /* in/out: unverified head */ xfs_daddr_t *tail_blk, /* out: tail block */ char *buffer, xfs_daddr_t *rhead_blk, /* start blk of last record */ struct xlog_rec_header **rhead, /* ptr to last record */ bool *wrapped) /* last rec. wraps phys. log */ { struct xlog_rec_header *tmp_rhead; char *tmp_buffer; xfs_daddr_t first_bad; xfs_daddr_t tmp_rhead_blk; int found; int error; bool tmp_wrapped; /* * Check the head of the log for torn writes. Search backwards from the * head until we hit the tail or the maximum number of log record I/Os * that could have been in flight at one time. Use a temporary buffer so * we don't trash the rhead/buffer pointers from the caller. */ tmp_buffer = xlog_alloc_buffer(log, 1); if (!tmp_buffer) return -ENOMEM; error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, XLOG_MAX_ICLOGS, tmp_buffer, &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped); kvfree(tmp_buffer); if (error < 0) return error; /* * Now run a CRC verification pass over the records starting at the * block found above to the current head. If a CRC failure occurs, the * log block of the first bad record is saved in first_bad. */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. */ error = 0; xfs_warn(log->l_mp, "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.", first_bad, *head_blk); /* * Get the header block and buffer pointer for the last good * record before the bad record. * * Note that xlog_find_tail() clears the blocks at the new head * (i.e., the records with invalid CRC) if the cycle number * matches the current cycle. */ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, buffer, rhead_blk, rhead, wrapped); if (found < 0) return found; if (found == 0) /* XXX: right thing to do here? */ return -EIO; /* * Reset the head block to the starting block of the first bad * log record and set the tail block based on the last good * record. * * Bail out if the updated head/tail match as this indicates * possible corruption outside of the acceptable * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair... */ *head_blk = first_bad; *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); if (*head_blk == *tail_blk) { ASSERT(0); return 0; } } if (error) return error; return xlog_verify_tail(log, *head_blk, tail_blk, be32_to_cpu((*rhead)->h_size)); } /* * We need to make sure we handle log wrapping properly, so we can't use the * calculated logbno directly. Make sure it wraps to the correct bno inside the * log. * * The log is limited to 32 bit sizes, so we use the appropriate modulus * operation here and cast it back to a 64 bit daddr on return. */ static inline xfs_daddr_t xlog_wrap_logbno( struct xlog *log, xfs_daddr_t bno) { int mod; div_s64_rem(bno, log->l_logBBsize, &mod); return mod; } /* * Check whether the head of the log points to an unmount record. In other * words, determine whether the log is clean. If so, update the in-core state * appropriately. */ static int xlog_check_unmount_rec( struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk, struct xlog_rec_header *rhead, xfs_daddr_t rhead_blk, char *buffer, bool *clean) { struct xlog_op_header *op_head; xfs_daddr_t umount_data_blk; xfs_daddr_t after_umount_blk; int hblks; int error; char *offset; *clean = false; /* * Look for unmount record. If we find it, then we know there was a * clean unmount. Since 'i' could be the last block in the physical * log, we convert to a log block before comparing to the head_blk. * * Save the current tail lsn to use to pass to xlog_clear_stale_blocks() * below. We won't want to clear the unmount record if there is one, so * we pass the lsn of the unmount record rather than the block after it. */ hblks = xlog_logrec_hblks(log, rhead); after_umount_blk = xlog_wrap_logbno(log, rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len))); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks); error = xlog_bread(log, umount_data_blk, 1, buffer, &offset); if (error) return error; op_head = (struct xlog_op_header *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { /* * Set tail and last sync so that newly written log * records will point recovery to after the current * unmount record. */ xlog_assign_atomic_lsn(&log->l_tail_lsn, log->l_curr_cycle, after_umount_blk); log->l_ailp->ail_head_lsn = atomic64_read(&log->l_tail_lsn); *tail_blk = after_umount_blk; *clean = true; } } return 0; } static void xlog_set_state( struct xlog *log, xfs_daddr_t head_blk, struct xlog_rec_header *rhead, xfs_daddr_t rhead_blk, bool bump_cycle) { /* * Reset log values according to the state of the log when we * crashed. In the case where head_blk == 0, we bump curr_cycle * one because the next write starts a new cycle rather than * continuing the cycle of the last good log record. At this * point we have guaranteed that all partial log records have been * accounted for. Therefore, we know that the last good log record * written was complete and ended exactly on the end boundary * of the physical log. */ log->l_prev_block = rhead_blk; log->l_curr_block = (int)head_blk; log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (bump_cycle) log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); log->l_ailp->ail_head_lsn = be64_to_cpu(rhead->h_lsn); } /* * Find the sync block number or the tail of the log. * * This will be the block number of the last record to have its * associated buffers synced to disk. Every log record header has * a sync lsn embedded in it. LSNs hold block numbers, so it is easy * to get a sync block number. The only concern is to figure out which * log record header to believe. * * The following algorithm uses the log record header with the largest * lsn. The entire log record does not need to be valid. We only care * that the header is valid. * * We could speed up search by using current head_blk buffer, but it is not * available. */ STATIC int xlog_find_tail( struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { xlog_rec_header_t *rhead; char *offset = NULL; char *buffer; int error; xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; bool wrapped = false; bool clean = false; /* * Find previous log record */ if ((error = xlog_find_head(log, head_blk))) return error; ASSERT(*head_blk < INT_MAX); buffer = xlog_alloc_buffer(log, 1); if (!buffer) return -ENOMEM; if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, buffer, &offset); if (error) goto done; if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ goto done; } } /* * Search backwards through the log looking for the log record header * block. This wraps all the way back around to the head so something is * seriously wrong if we can't find it. */ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer, &rhead_blk, &rhead, &wrapped); if (error < 0) goto done; if (!error) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); error = -EFSCORRUPTED; goto done; } *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); /* * Set the log state based on the current head record. */ xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); tail_lsn = atomic64_read(&log->l_tail_lsn); /* * Look for an unmount record at the head of the log. This sets the log * state to determine whether recovery is necessary. */ error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, rhead_blk, buffer, &clean); if (error) goto done; /* * Verify the log head if the log is not clean (e.g., we have anything * but an unmount record at the head). This uses CRC verification to * detect and trim torn writes. If discovered, CRC failures are * considered torn writes and the log head is trimmed accordingly. * * Note that we can only run CRC verification when the log is dirty * because there's no guarantee that the log data behind an unmount * record is compatible with the current architecture. */ if (!clean) { xfs_daddr_t orig_head = *head_blk; error = xlog_verify_head(log, head_blk, tail_blk, buffer, &rhead_blk, &rhead, &wrapped); if (error) goto done; /* update in-core state again if the head changed */ if (*head_blk != orig_head) { xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); tail_lsn = atomic64_read(&log->l_tail_lsn); error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, rhead_blk, buffer, &clean); if (error) goto done; } } /* * Note that the unmount was clean. If the unmount was not clean, we * need to know this to rebuild the superblock counters from the perag * headers if we have a filesystem using non-persistent counters. */ if (clean) set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate); /* * Make sure that there are no blocks in front of the head * with the same cycle number as the head. This can happen * because we allow multiple outstanding log writes concurrently, * and the later writes might make it out before earlier ones. * * We use the lsn from before modifying it so that we'll never * overwrite the unmount record after a clean unmount. * * Do this only if we are going to recover the filesystem * * NOTE: This used to say "if (!readonly)" * However on Linux, we can & do recover a read-only filesystem. * We only skip recovery if NORECOVERY is specified on mount, * in which case we would not be here. * * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ if (!xfs_readonly_buftarg(log->l_targ)) error = xlog_clear_stale_blocks(log, tail_lsn); done: kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to locate log tail"); return error; } /* * Is the log zeroed at all? * * The last binary search should be changed to perform an X block read * once X becomes small enough. You can then search linearly through * the X blocks. This will cut down on the number of reads we need to do. * * If the log is partially zeroed, this routine will pass back the blkno * of the first block with cycle number 0. It won't have a complete LR * preceding it. * * Return: * 0 => the log is completely written to * 1 => use *blk_no as the first block of the log * <0 => error has occurred */ STATIC int xlog_find_zeroed( struct xlog *log, xfs_daddr_t *blk_no) { char *buffer; char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; int error, log_bbnum = log->l_logBBsize; int ret = 1; *blk_no = 0; /* check totally zeroed log */ buffer = xlog_alloc_buffer(log, 1); if (!buffer) return -ENOMEM; error = xlog_bread(log, 0, 1, buffer, &offset); if (error) goto out_free_buffer; first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; goto out_free_buffer; } /* check partially zeroed log */ error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset); if (error) goto out_free_buffer; last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ ret = 0; goto out_free_buffer; } /* we have a partially zeroed log */ last_blk = log_bbnum-1; error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0); if (error) goto out_free_buffer; /* * Validate the answer. Because there is no way to guarantee that * the entire log is made up of log records which are the same size, * we scan over the defined maximum blocks. At this point, the maximum * is not chosen to mean anything special. XXXmiken */ num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); ASSERT(num_scan_bblks <= INT_MAX); if (last_blk < num_scan_bblks) num_scan_bblks = last_blk; start_blk = last_blk - num_scan_bblks; /* * We search for any instances of cycle number 0 that occur before * our current estimate of the head. What we're trying to detect is * 1 ... | 0 | 1 | 0... * ^ binary search ends here */ if ((error = xlog_find_verify_cycle(log, start_blk, (int)num_scan_bblks, 0, &new_blk))) goto out_free_buffer; if (new_blk != -1) last_blk = new_blk; /* * Potentially backup over partial log record write. We don't need * to search the end of the log because we know it is zero. */ error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0); if (error == 1) error = -EIO; if (error) goto out_free_buffer; *blk_no = last_blk; out_free_buffer: kvfree(buffer); if (error) return error; return ret; } /* * These are simple subroutines used by xlog_clear_stale_blocks() below * to initialize a buffer full of empty log record headers and write * them into the log. */ STATIC void xlog_add_record( struct xlog *log, char *buf, int cycle, int block, int tail_cycle, int tail_block) { xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; memset(buf, 0, BBSIZE); recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); recp->h_cycle = cpu_to_be32(cycle); recp->h_version = cpu_to_be32( xfs_has_logv2(log->l_mp) ? 2 : 1); recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); recp->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); } STATIC int xlog_write_log_records( struct xlog *log, int cycle, int start_block, int blocks, int tail_cycle, int tail_block) { char *offset; char *buffer; int balign, ealign; int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; /* * Greedily allocate a buffer big enough to handle the full * range of basic blocks to be written. If that fails, try * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ bufblks = roundup_pow_of_two(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) return -ENOMEM; } /* We may need to do a read at the start to fill in part of * the buffer in the starting sector not covered by the first * write below. */ balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, buffer); if (error) goto out_free_buffer; j = start_block - balign; } for (i = start_block; i < end_block; i += bufblks) { int bcount, endcount; bcount = min(bufblks, end_block - start_block); endcount = bcount - j; /* We may need to do a read at the end to fill in part of * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { error = xlog_bread_noalign(log, ealign, sectbb, buffer + BBTOB(ealign - start_block)); if (error) break; } offset = buffer + xlog_align(log, start_block); for (; j < endcount; j++) { xlog_add_record(log, offset, cycle, i+j, tail_cycle, tail_block); offset += BBSIZE; } error = xlog_bwrite(log, start_block, endcount, buffer); if (error) break; start_block += endcount; j = 0; } out_free_buffer: kvfree(buffer); return error; } /* * This routine is called to blow away any incomplete log writes out * in front of the log head. We do this so that we won't become confused * if we come up, write only a little bit more, and then crash again. * If we leave the partial log records out there, this situation could * cause us to think those partial writes are valid blocks since they * have the current cycle number. We get rid of them by overwriting them * with empty log records with the old cycle number rather than the * current one. * * The tail lsn is passed in rather than taken from * the log so that we will not write over the unmount record after a * clean unmount in a 512 block log. Doing so would leave the log without * any valid log records in it until a new one was written. If we crashed * during that time we would not be able to recover. */ STATIC int xlog_clear_stale_blocks( struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; int tail_block, head_block; int tail_distance, max_distance; int distance; int error; tail_cycle = CYCLE_LSN(tail_lsn); tail_block = BLOCK_LSN(tail_lsn); head_cycle = log->l_curr_cycle; head_block = log->l_curr_block; /* * Figure out the distance between the new head of the log * and the tail. We want to write over any blocks beyond the * head that we may have written just before the crash, but * we don't want to overwrite the tail of the log. */ if (head_cycle == tail_cycle) { /* * The tail is behind the head in the physical log, * so the distance from the head to the tail is the * distance from the head to the end of the log plus * the distance from the beginning of the log to the * tail. */ if (XFS_IS_CORRUPT(log->l_mp, head_block < tail_block || head_block >= log->l_logBBsize)) return -EFSCORRUPTED; tail_distance = tail_block + (log->l_logBBsize - head_block); } else { /* * The head is behind the tail in the physical log, * so the distance from the head to the tail is just * the tail block minus the head block. */ if (XFS_IS_CORRUPT(log->l_mp, head_block >= tail_block || head_cycle != tail_cycle + 1)) return -EFSCORRUPTED; tail_distance = tail_block - head_block; } /* * If the head is right up against the tail, we can't clear * anything. */ if (tail_distance <= 0) { ASSERT(tail_distance == 0); return 0; } max_distance = XLOG_TOTAL_REC_SHIFT(log); /* * Take the smaller of the maximum amount of outstanding I/O * we could have and the distance to the tail to clear out. * We take the smaller so that we don't overwrite the tail and * we don't waste all day writing from the head to the tail * for no reason. */ max_distance = min(max_distance, tail_distance); if ((head_block + max_distance) <= log->l_logBBsize) { /* * We can stomp all the blocks we need to without * wrapping around the end of the log. Just do it * in a single write. Use the cycle number of the * current cycle minus one so that the log will look like: * n ... | n - 1 ... */ error = xlog_write_log_records(log, (head_cycle - 1), head_block, max_distance, tail_cycle, tail_block); if (error) return error; } else { /* * We need to wrap around the end of the physical log in * order to clear all the blocks. Do it in two separate * I/Os. The first write should be from the head to the * end of the physical log, and it should use the current * cycle number minus one just like above. */ distance = log->l_logBBsize - head_block; error = xlog_write_log_records(log, (head_cycle - 1), head_block, distance, tail_cycle, tail_block); if (error) return error; /* * Now write the blocks at the start of the physical log. * This writes the remainder of the blocks we want to clear. * It uses the current cycle number since we're now on the * same cycle as the head so that we get: * n ... n ... | n - 1 ... * ^^^^^ blocks we're writing */ distance = max_distance - (log->l_logBBsize - head_block); error = xlog_write_log_records(log, head_cycle, 0, distance, tail_cycle, tail_block); if (error) return error; } return 0; } /* * Release the recovered intent item in the AIL that matches the given intent * type and intent id. */ void xlog_recover_release_intent( struct xlog *log, unsigned short intent_type, uint64_t intent_id) { struct xfs_defer_pending *dfp, *n; list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { struct xfs_log_item *lip = dfp->dfp_intent; if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; ASSERT(xlog_item_is_intent(lip)); xfs_defer_cancel_recovery(log->l_mp, dfp); } } int xlog_recover_iget( struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp) { int error; error = xfs_iget(mp, NULL, ino, 0, 0, ipp); if (error) return error; error = xfs_qm_dqattach(*ipp); if (error) { xfs_irele(*ipp); return error; } if (VFS_I(*ipp)->i_nlink == 0) xfs_iflags_set(*ipp, XFS_IRECOVERY); return 0; } /* * Get an inode so that we can recover a log operation. * * Log intent items that target inodes effectively contain a file handle. * Check that the generation number matches the intent item like we do for * other file handles. Log intent items defined after this validation weakness * was identified must use this function. */ int xlog_recover_iget_handle( struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen, struct xfs_inode **ipp) { struct xfs_inode *ip; int error; error = xlog_recover_iget(mp, ino, &ip); if (error) return error; if (VFS_I(ip)->i_generation != gen) { xfs_irele(ip); return -EFSCORRUPTED; } *ipp = ip; return 0; } /****************************************************************************** * * Log recover routines * ****************************************************************************** */ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_buf_item_ops, &xlog_inode_item_ops, &xlog_dquot_item_ops, &xlog_quotaoff_item_ops, &xlog_icreate_item_ops, &xlog_efi_item_ops, &xlog_efd_item_ops, &xlog_rui_item_ops, &xlog_rud_item_ops, &xlog_cui_item_ops, &xlog_cud_item_ops, &xlog_bui_item_ops, &xlog_bud_item_ops, &xlog_attri_item_ops, &xlog_attrd_item_ops, &xlog_xmi_item_ops, &xlog_xmd_item_ops, }; static const struct xlog_recover_item_ops * xlog_find_item_ops( struct xlog_recover_item *item) { unsigned int i; for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++) if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type) return xlog_recover_item_ops[i]; return NULL; } /* * Sort the log items in the transaction. * * The ordering constraints are defined by the inode allocation and unlink * behaviour. The rules are: * * 1. Every item is only logged once in a given transaction. Hence it * represents the last logged state of the item. Hence ordering is * dependent on the order in which operations need to be performed so * required initial conditions are always met. * * 2. Cancelled buffers are recorded in pass 1 in a separate table and * there's nothing to replay from them so we can simply cull them * from the transaction. However, we can't do that until after we've * replayed all the other items because they may be dependent on the * cancelled buffer and replaying the cancelled buffer can remove it * form the cancelled buffer table. Hence they have tobe done last. * * 3. Inode allocation buffers must be replayed before inode items that * read the buffer and replay changes into it. For filesystems using the * ICREATE transactions, this means XFS_LI_ICREATE objects need to get * treated the same as inode allocation buffers as they create and * initialise the buffers directly. * * 4. Inode unlink buffers must be replayed after inode items are replayed. * This ensures that inodes are completely flushed to the inode buffer * in a "free" state before we remove the unlinked inode list pointer. * * Hence the ordering needs to be inode allocation buffers first, inode items * second, inode unlink buffers third and cancelled buffers last. * * But there's a problem with that - we can't tell an inode allocation buffer * apart from a regular buffer, so we can't separate them. We can, however, * tell an inode unlink buffer from the others, and so we can separate them out * from all the other buffers and move them to last. * * Hence, 4 lists, in order from head to tail: * - buffer_list for all buffers except cancelled/inode unlink buffers * - item_list for all non-buffer items * - inode_buffer_list for inode unlink buffers * - cancel_list for the cancelled buffers * * Note that we add objects to the tail of the lists so that first-to-last * ordering is preserved within the lists. Adding objects to the head of the * list means when we traverse from the head we walk them in last-to-first * order. For cancelled buffers and inode unlink buffers this doesn't matter, * but for all other items there may be specific ordering that we need to * preserve. */ STATIC int xlog_recover_reorder_trans( struct xlog *log, struct xlog_recover *trans, int pass) { struct xlog_recover_item *item, *n; int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); LIST_HEAD(inode_buffer_list); LIST_HEAD(item_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST; item->ri_ops = xlog_find_item_ops(item); if (!item->ri_ops) { xfs_warn(log->l_mp, "%s: unrecognized type of log operation (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); /* * return the remaining items back to the transaction * item list so they can be freed in caller. */ if (!list_empty(&sort_list)) list_splice_init(&sort_list, &trans->r_itemq); error = -EFSCORRUPTED; break; } if (item->ri_ops->reorder) fate = item->ri_ops->reorder(item); switch (fate) { case XLOG_REORDER_BUFFER_LIST: list_move_tail(&item->ri_list, &buffer_list); break; case XLOG_REORDER_CANCEL_LIST: trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); list_move(&item->ri_list, &cancel_list); break; case XLOG_REORDER_INODE_BUFFER_LIST: list_move(&item->ri_list, &inode_buffer_list); break; case XLOG_REORDER_ITEM_LIST: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &item_list); break; } } ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); if (!list_empty(&item_list)) list_splice_tail(&item_list, &trans->r_itemq); if (!list_empty(&inode_buffer_list)) list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) list_splice_tail(&cancel_list, &trans->r_itemq); return error; } void xlog_buf_readahead( struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops) { if (!xlog_is_buffer_cancelled(log, blkno, len)) xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } /* * Create a deferred work structure for resuming and tracking the progress of a * log intent item that was found during recovery. */ void xlog_recover_intent_item( struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, const struct xfs_defer_op_type *ops) { ASSERT(xlog_item_is_intent(lip)); xfs_defer_start_recovery(lip, &log->r_dfops, ops); /* * Insert the intent into the AIL directly and drop one reference so * that finishing or canceling the work will drop the other. */ xfs_trans_ail_insert(log->l_ailp, lip, lsn); lip->li_ops->iop_unpin(lip, 0); } STATIC int xlog_recover_items_pass2( struct xlog *log, struct xlog_recover *trans, struct list_head *buffer_list, struct list_head *item_list) { struct xlog_recover_item *item; int error = 0; list_for_each_entry(item, item_list, ri_list) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); if (item->ri_ops->commit_pass2) error = item->ri_ops->commit_pass2(log, buffer_list, item, trans->r_lsn); if (error) return error; } return error; } /* * Perform the transaction. * * If the transaction modifies a buffer or inode, do it now. Otherwise, * EFIs and EFDs get queued up by adding entries into the AIL for them. */ STATIC int xlog_recover_commit_trans( struct xlog *log, struct xlog_recover *trans, int pass, struct list_head *buffer_list) { int error = 0; int items_queued = 0; struct xlog_recover_item *item; struct xlog_recover_item *next; LIST_HEAD (ra_list); LIST_HEAD (done_list); #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 hlist_del_init(&trans->r_list); error = xlog_recover_reorder_trans(log, trans, pass); if (error) return error; list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { trace_xfs_log_recover_item_recover(log, trans, item, pass); switch (pass) { case XLOG_RECOVER_PASS1: if (item->ri_ops->commit_pass1) error = item->ri_ops->commit_pass1(log, item); break; case XLOG_RECOVER_PASS2: if (item->ri_ops->ra_pass2) item->ri_ops->ra_pass2(log, item); list_move_tail(&item->ri_list, &ra_list); items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { error = xlog_recover_items_pass2(log, trans, buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); items_queued = 0; } break; default: ASSERT(0); } if (error) goto out; } out: if (!list_empty(&ra_list)) { if (!error) error = xlog_recover_items_pass2(log, trans, buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); } if (!list_empty(&done_list)) list_splice_init(&done_list, &trans->r_itemq); return error; } STATIC void xlog_recover_add_item( struct list_head *head) { struct xlog_recover_item *item; item = kzalloc(sizeof(struct xlog_recover_item), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, char *dp, int len) { struct xlog_recover_item *item; char *ptr, *old_ptr; int old_len; /* * If the transaction is empty, the header was split across this and the * previous record. Copy the rest of the header. */ if (list_empty(&trans->r_itemq)) { ASSERT(len <= sizeof(struct xfs_trans_header)); if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); return -EFSCORRUPTED; } xlog_recover_add_item(&trans->r_itemq); ptr = (char *)&trans->r_theader + sizeof(struct xfs_trans_header) - len; memcpy(ptr, dp, len); return 0; } /* take the tail entry */ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); if (!ptr) return -ENOMEM; memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } /* * The next region to add is the start of a new region. It could be * a whole region or it could be the first part of a new region. Because * of this, the assumption here is that the type and size fields of all * format structures fit into the first 32 bits of the structure. * * This works because all regions must be 32 bit aligned. Therefore, we * either have both fields or we have neither field. In the case we have * neither field, the data part of the region is zero length. We only have * a log_op_header and can throw away the header since a new one will appear * later. If we have at least 4 bytes, then we can determine how many regions * will appear in the current log item. */ STATIC int xlog_recover_add_to_trans( struct xlog *log, struct xlog_recover *trans, char *dp, int len) { struct xfs_inode_log_format *in_f; /* any will do */ struct xlog_recover_item *item; char *ptr; if (!len) return 0; if (list_empty(&trans->r_itemq)) { /* we need to catch log corruptions here */ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { xfs_warn(log->l_mp, "%s: bad header magic number", __func__); ASSERT(0); return -EFSCORRUPTED; } if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); ASSERT(0); return -EFSCORRUPTED; } /* * The transaction header can be arbitrarily split across op * records. If we don't have the whole thing here, copy what we * do have and handle the rest in the next record. */ if (len == sizeof(struct xfs_trans_header)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); return 0; } ptr = xlog_kvmalloc(len); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; /* take the tail entry */ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, ri_list); if (item->ri_total != 0 && item->ri_total == item->ri_cnt) { /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, ri_list); } if (item->ri_total == 0) { /* first region to be added */ if (in_f->ilf_size == 0 || in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { xfs_warn(log->l_mp, "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); kvfree(ptr); return -EFSCORRUPTED; } item->ri_total = in_f->ilf_size; item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t), GFP_KERNEL | __GFP_NOFAIL); } if (item->ri_total <= item->ri_cnt) { xfs_warn(log->l_mp, "log item region count (%d) overflowed size (%d)", item->ri_cnt, item->ri_total); ASSERT(0); kvfree(ptr); return -EFSCORRUPTED; } /* Description region is ri_buf[0] */ item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } /* * Free up any resources allocated by the transaction * * Remember that EFIs, EFDs, and IUNLINKs are handled later. */ STATIC void xlog_recover_free_trans( struct xlog_recover *trans) { struct xlog_recover_item *item, *n; int i; hlist_del_init(&trans->r_list); list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { /* Free the regions in the item. */ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) kvfree(item->ri_buf[i].i_addr); /* Free the item itself */ kfree(item->ri_buf); kfree(item); } /* Free the transaction recover structure */ kfree(trans); } /* * On error or completion, trans is freed. */ STATIC int xlog_recovery_process_trans( struct xlog *log, struct xlog_recover *trans, char *dp, unsigned int len, unsigned int flags, int pass, struct list_head *buffer_list) { int error = 0; bool freeit = false; /* mask off ophdr transaction container flags */ flags &= ~XLOG_END_TRANS; if (flags & XLOG_WAS_CONT_TRANS) flags &= ~XLOG_CONTINUE_TRANS; /* * Callees must not free the trans structure. We'll decide if we need to * free it or not based on the operation being done and it's result. */ switch (flags) { /* expected flag values */ case 0: case XLOG_CONTINUE_TRANS: error = xlog_recover_add_to_trans(log, trans, dp, len); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, trans, dp, len); break; case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, trans, pass, buffer_list); /* success or fail, we are now done with this transaction. */ freeit = true; break; /* unexpected flag values */ case XLOG_UNMOUNT_TRANS: /* just skip trans */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); freeit = true; break; case XLOG_START_TRANS: default: xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); error = -EFSCORRUPTED; break; } if (error || freeit) xlog_recover_free_trans(trans); return error; } /* * Lookup the transaction recovery structure associated with the ID in the * current ophdr. If the transaction doesn't exist and the start flag is set in * the ophdr, then allocate a new transaction for future ID matches to find. * Either way, return what we found during the lookup - an existing transaction * or nothing. */ STATIC struct xlog_recover * xlog_recover_ophdr_to_trans( struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead) { struct xlog_recover *trans; xlog_tid_t tid; struct hlist_head *rhp; tid = be32_to_cpu(ohead->oh_tid); rhp = &rhash[XLOG_RHASH(tid)]; hlist_for_each_entry(trans, rhp, r_list) { if (trans->r_log_tid == tid) return trans; } /* * skip over non-start transaction headers - we could be * processing slack space before the next transaction starts */ if (!(ohead->oh_flags & XLOG_START_TRANS)) return NULL; ASSERT(be32_to_cpu(ohead->oh_len) == 0); /* * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. */ trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); INIT_HLIST_NODE(&trans->r_list); hlist_add_head(&trans->r_list, rhp); /* * Nothing more to do for this ophdr. Items to be added to this new * transaction will be in subsequent ophdr containers. */ return NULL; } STATIC int xlog_recover_process_ophdr( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead, char *dp, char *end, int pass, struct list_head *buffer_list) { struct xlog_recover *trans; unsigned int len; int error; /* Do we understand who wrote this op? */ if (ohead->oh_clientid != XFS_TRANSACTION && ohead->oh_clientid != XFS_LOG) { xfs_warn(log->l_mp, "%s: bad clientid 0x%x", __func__, ohead->oh_clientid); ASSERT(0); return -EFSCORRUPTED; } /* * Check the ophdr contains all the data it is supposed to contain. */ len = be32_to_cpu(ohead->oh_len); if (dp + len > end) { xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); WARN_ON(1); return -EFSCORRUPTED; } trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); if (!trans) { /* nothing to do, so skip over this ophdr */ return 0; } /* * The recovered buffer queue is drained only once we know that all * recovery items for the current LSN have been processed. This is * required because: * * - Buffer write submission updates the metadata LSN of the buffer. * - Log recovery skips items with a metadata LSN >= the current LSN of * the recovery item. * - Separate recovery items against the same metadata buffer can share * a current LSN. I.e., consider that the LSN of a recovery item is * defined as the starting LSN of the first record in which its * transaction appears, that a record can hold multiple transactions, * and/or that a transaction can span multiple records. * * In other words, we are allowed to submit a buffer from log recovery * once per current LSN. Otherwise, we may incorrectly skip recovery * items and cause corruption. * * We don't know up front whether buffers are updated multiple times per * LSN. Therefore, track the current LSN of each commit log record as it * is processed and drain the queue when it changes. Use commit records * because they are ordered correctly by the logging code. */ if (log->l_recovery_lsn != trans->r_lsn && ohead->oh_flags & XLOG_COMMIT_TRANS) { error = xfs_buf_delwri_submit(buffer_list); if (error) return error; log->l_recovery_lsn = trans->r_lsn; } return xlog_recovery_process_trans(log, trans, dp, len, ohead->oh_flags, pass, buffer_list); } /* * There are two valid states of the r_state field. 0 indicates that the * transaction structure is in a normal state. We have either seen the * start of the transaction or the last operation we added was not a partial * operation. If the last operation we added to the transaction was a * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. * * NOTE: skip LRs with 0 data length. */ STATIC int xlog_recover_process_data( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, int pass, struct list_head *buffer_list) { struct xlog_op_header *ohead; char *end; int num_logops; int error; end = dp + be32_to_cpu(rhead->h_len); num_logops = be32_to_cpu(rhead->h_num_logops); /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) return -EIO; trace_xfs_log_recover_record(log, rhead, pass); while ((dp < end) && num_logops) { ohead = (struct xlog_op_header *)dp; dp += sizeof(*ohead); if (dp > end) { xfs_warn(log->l_mp, "%s: op header overrun", __func__); return -EFSCORRUPTED; } /* errors will abort recovery */ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, dp, end, pass, buffer_list); if (error) return error; dp += be32_to_cpu(ohead->oh_len); num_logops--; } return 0; } /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( struct xfs_mount *mp, struct list_head *capture_list) { struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; int error = 0; list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { struct xfs_trans_res resv; struct xfs_defer_resources dres; /* * Create a new transaction reservation from the captured * information. Set logcount to 1 to force the new transaction * to regrant every roll so that we can make forward progress * in recovery no matter how full the log might be. */ resv.tr_logres = dfc->dfc_logres; resv.tr_logcount = 1; resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) { xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR); return error; } /* * Transfer to this new transaction all the dfops we captured * from recovering a single intent item. */ list_del_init(&dfc->dfc_list); xfs_defer_ops_continue(dfc, tp, &dres); error = xfs_trans_commit(tp); xfs_defer_resources_rele(&dres); if (error) return error; } ASSERT(list_empty(capture_list)); return 0; } /* Release all the captured defer ops and capture structures in this list. */ static void xlog_abort_defer_ops( struct xfs_mount *mp, struct list_head *capture_list) { struct xfs_defer_capture *dfc; struct xfs_defer_capture *next; list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); xfs_defer_ops_capture_abort(mp, dfc); } } /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now is update * the data structures associated with each one. * * Since we process the log intent items in normal transactions, they will be * removed at some point after the commit. This prevents us from just walking * down the list processing each one. We'll use a flag in the intent item to * skip those that we've already processed and use the AIL iteration mechanism's * generation count to try to speed this up at least a bit. * * When we start, we know that the intents are the only things in the AIL. As we * process them, however, other items are added to the AIL. Hence we know we * have started recovery on all the pending intents when we find an non-intent * item in the AIL. */ STATIC int xlog_recover_process_intents( struct xlog *log) { LIST_HEAD(capture_list); struct xfs_defer_pending *dfp, *n; int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more * deferred ops, you /must/ attach them to the capture list in * the recover routine or else those subsequent intents will be * replayed in the wrong order! * * The recovery function can free the log item, so we must not * access dfp->dfp_intent after it returns. It must dispose of * @dfp if it returns 0. */ error = xfs_defer_finish_recovery(log->l_mp, dfp, &capture_list); if (error) break; } if (error) goto err; error = xlog_finish_defer_ops(log->l_mp, &capture_list); if (error) goto err; return 0; err: xlog_abort_defer_ops(log->l_mp, &capture_list); return error; } /* * A cancel occurs when the mount has failed and we're bailing out. Release all * pending log intent items that we haven't started recovery on so they don't * pin the AIL. */ STATIC void xlog_recover_cancel_intents( struct xlog *log) { struct xfs_defer_pending *dfp, *n; list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { ASSERT(xlog_item_is_intent(dfp->dfp_intent)); xfs_defer_cancel_recovery(log->l_mp, dfp); } } /* * Transfer ownership of the recovered pending work to the recovery transaction * and try to finish the work. If there is more work to be done, the dfp will * remain attached to the transaction. If not, the dfp is freed. */ int xlog_recover_finish_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { int error; list_move(&dfp->dfp_list, &tp->t_dfops); error = xfs_defer_finish_one(tp, dfp); if (error == -EAGAIN) return 0; return error; } /* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. */ STATIC void xlog_recover_clear_agi_bucket( struct xfs_perag *pag, int bucket) { struct xfs_mount *mp = pag->pag_mount; struct xfs_trans *tp; struct xfs_agi *agi; struct xfs_buf *agibp; int offset; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) goto out_error; error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out_abort; agi = agibp->b_addr; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); error = xfs_trans_commit(tp); if (error) goto out_error; return; out_abort: xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, pag->pag_agno); return; } static int xlog_recover_iunlink_bucket( struct xfs_perag *pag, struct xfs_agi *agi, int bucket) { struct xfs_mount *mp = pag->pag_mount; struct xfs_inode *prev_ip = NULL; struct xfs_inode *ip; xfs_agino_t prev_agino, agino; int error = 0; agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { error = xfs_iget(mp, NULL, XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), 0, 0, &ip); if (error) break; ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); xfs_iflags_clear(ip, XFS_IRECOVERY); agino = ip->i_next_unlinked; if (prev_ip) { ip->i_prev_unlinked = prev_agino; xfs_irele(prev_ip); /* * Ensure the inode is removed from the unlinked list * before we continue so that it won't race with * building the in-memory list here. This could be * serialised with the agibp lock, but that just * serialises via lockstepping and it's much simpler * just to flush the inodegc queue and wait for it to * complete. */ error = xfs_inodegc_flush(mp); if (error) break; } prev_agino = agino; prev_ip = ip; } if (prev_ip) { int error2; ip->i_prev_unlinked = prev_agino; xfs_irele(prev_ip); error2 = xfs_inodegc_flush(mp); if (error2 && !error) return error2; } return error; } /* * Recover AGI unlinked lists * * This is called during recovery to process any inodes which we unlinked but * not freed when the system crashed. These inodes will be on the lists in the * AGI blocks. What we do here is scan all the AGIs and fully truncate and free * any inodes found on the lists. Each inode is removed from the lists when it * has been fully truncated and is freed. The freeing of the inode and its * removal from the list must be atomic. * * If everything we touch in the agi processing loop is already in memory, this * loop can hold the cpu for a long time. It runs without lock contention, * memory allocation contention, the need wait for IO, etc, and so will run * until we either run out of inodes to process, run low on memory or we run out * of log space. * * This behaviour is bad for latency on single CPU and non-preemptible kernels, * and can prevent other filesystem work (such as CIL pushes) from running. This * can lead to deadlocks if the recovery process runs out of log reservation * space. Hence we need to yield the CPU when there is other kernel work * scheduled on this CPU to ensure other scheduled work can run without undue * latency. */ static void xlog_recover_iunlink_ag( struct xfs_perag *pag) { struct xfs_agi *agi; struct xfs_buf *agibp; int bucket; int error; error = xfs_read_agi(pag, NULL, 0, &agibp); if (error) { /* * AGI is b0rked. Don't process it. * * We should probably mark the filesystem as corrupt after we've * recovered all the ag's we can.... */ return; } /* * Unlock the buffer so that it can be acquired in the normal course of * the transaction to truncate and free each inode. Because we are not * racing with anyone else here for the AGI buffer, we don't even need * to hold it locked to read the initial unlinked bucket entries out of * the buffer. We keep buffer reference though, so that it stays pinned * in memory while we need the buffer. */ agi = agibp->b_addr; xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { error = xlog_recover_iunlink_bucket(pag, agi, bucket); if (error) { /* * Bucket is unrecoverable, so only a repair scan can * free the remaining unlinked inodes. Just empty the * bucket and remaining inodes on it unreferenced and * unfreeable. */ xlog_recover_clear_agi_bucket(pag, bucket); } } xfs_buf_rele(agibp); } static void xlog_recover_process_iunlinks( struct xlog *log) { struct xfs_perag *pag; xfs_agnumber_t agno; for_each_perag(log->l_mp, agno, pag) xlog_recover_iunlink_ag(pag); } STATIC void xlog_unpack_data( struct xlog_rec_header *rhead, char *dp, struct xlog *log) { int i, j, k; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; dp += BBSIZE; } if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; dp += BBSIZE; } } } /* * CRC check, unpack and process a log record. */ STATIC int xlog_recover_process( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, int pass, struct list_head *buffer_list) { __le32 old_crc = rhead->h_crc; __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } /* * We're in the normal recovery path. Issue a warning if and only if the * CRC in the header is non-zero. This is an advisory warning and the * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ if (crc != old_crc) { if (old_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } /* * If the filesystem is CRC enabled, this mismatch becomes a * fatal log corruption failure. */ if (xfs_has_crc(log->l_mp)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } } xlog_unpack_data(rhead, dp, log); return xlog_recover_process_data(log, rhash, rhead, dp, pass, buffer_list); } STATIC int xlog_valid_rec_header( struct xlog *log, struct xlog_rec_header *rhead, xfs_daddr_t blkno, int bufsize) { int hlen; if (XFS_IS_CORRUPT(log->l_mp, rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) return -EFSCORRUPTED; if (XFS_IS_CORRUPT(log->l_mp, (!rhead->h_version || (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); return -EFSCORRUPTED; } /* * LR body must have data (or it wouldn't have been written) * and h_len must not be greater than LR buffer size. */ hlen = be32_to_cpu(rhead->h_len); if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize)) return -EFSCORRUPTED; if (XFS_IS_CORRUPT(log->l_mp, blkno > log->l_logBBsize || blkno > INT_MAX)) return -EFSCORRUPTED; return 0; } /* * Read the log from tail to head and process the log records found. * Handle the two cases where the tail and head are in the same cycle * and where the active portion of the log wraps around the end of * the physical log separately. The pass parameter is passed through * to the routines called to process the data and is not looked at * here. */ STATIC int xlog_do_recovery_pass( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass, xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; char *hbp, *dbp; int error = 0, h_size, h_len; int error2 = 0; int bblks, split_bblks; int hblks = 1, split_hblks, wrapped_hblks; int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); blk_no = rhead_blk = tail_blk; for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); hbp = xlog_alloc_buffer(log, hblks); if (!hbp) return -ENOMEM; /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. */ if (xfs_has_logv2(log->l_mp)) { /* * When using variable length iclogs, read first sector of * iclog header and extract the header size from it. Get a * new hbp that is the correct size. */ error = xlog_bread(log, tail_blk, 1, hbp, &offset); if (error) goto bread_err1; rhead = (xlog_rec_header_t *)offset; /* * xfsprogs has a bug where record length is based on lsunit but * h_size (iclog size) is hardcoded to 32k. Now that we * unconditionally CRC verify the unmount record, this means the * log buffer can be too small for the record and cause an * overrun. * * Detect this condition here. Use lsunit for the buffer size as * long as this looks like the mkfs case. Otherwise, return an * error to avoid a buffer overrun. */ h_size = be32_to_cpu(rhead->h_size); h_len = be32_to_cpu(rhead->h_len); if (h_len > h_size && h_len <= log->l_mp->m_logbsize && rhead->h_num_logops == cpu_to_be32(1)) { xfs_warn(log->l_mp, "invalid iclog size (%d bytes), using lsunit (%d bytes)", h_size, log->l_mp->m_logbsize); h_size = log->l_mp->m_logbsize; } error = xlog_valid_rec_header(log, rhead, tail_blk, h_size); if (error) goto bread_err1; /* * This open codes xlog_logrec_hblks so that we can reuse the * fixed up h_size value calculated above. Without that we'd * still allocate the buffer based on the incorrect on-disk * size. */ if (h_size > XLOG_HEADER_CYCLE_SIZE && (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) { hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); if (hblks > 1) { kvfree(hbp); hbp = xlog_alloc_buffer(log, hblks); if (!hbp) return -ENOMEM; } } } else { ASSERT(log->l_sectBBsize == 1); h_size = XLOG_BIG_RECORD_BSIZE; } dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { kvfree(hbp); return -ENOMEM; } memset(rhash, 0, sizeof(rhash)); if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. * When the head is not on the same cycle number as the tail, * we can't do a sequential recovery. */ while (blk_no < log->l_logBBsize) { /* * Check for header wrapping around physical end-of-log */ offset = hbp; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { /* Read header in one read */ error = xlog_bread(log, blk_no, hblks, hbp, &offset); if (error) goto bread_err2; } else { /* This LR is split across physical log end */ if (blk_no != log->l_logBBsize) { /* some data before physical log end */ ASSERT(blk_no <= INT_MAX); split_hblks = log->l_logBBsize - (int)blk_no; ASSERT(split_hblks > 0); error = xlog_bread(log, blk_no, split_hblks, hbp, &offset); if (error) goto bread_err2; } /* * Note: this black magic still works with * large sector sizes (non-512) only because: * - we increased the buffer size originally * by 1 sector giving us enough extra space * for the second read; * - the log start is guaranteed to be sector * aligned; * - we read the log end (LR header start) * _first_, then the log start (LR header end) * - order is important. */ wrapped_hblks = hblks - split_hblks; error = xlog_bread_noalign(log, 0, wrapped_hblks, offset + BBTOB(split_hblks)); if (error) goto bread_err2; } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, split_hblks ? blk_no : 0, h_size); if (error) goto bread_err2; bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; /* * Read the log record data in multiple reads if it * wraps around the end of the log. Note that if the * header already wrapped, blk_no could point past the * end of the log. The record data is contiguous in * that case. */ if (blk_no + bblks <= log->l_logBBsize || blk_no >= log->l_logBBsize) { rblk_no = xlog_wrap_logbno(log, blk_no); error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; } else { /* This log record is split across the * physical end of log */ offset = dbp; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical * end of log */ ASSERT(!wrapped_hblks); ASSERT(blk_no <= INT_MAX); split_bblks = log->l_logBBsize - (int)blk_no; ASSERT(split_bblks > 0); error = xlog_bread(log, blk_no, split_bblks, dbp, &offset); if (error) goto bread_err2; } /* * Note: this black magic still works with * large sector sizes (non-512) only because: * - we increased the buffer size originally * by 1 sector giving us enough extra space * for the second read; * - the log start is guaranteed to be sector * aligned; * - we read the log end (LR header start) * _first_, then the log start (LR header end) * - order is important. */ error = xlog_bread_noalign(log, 0, bblks - split_bblks, offset + BBTOB(split_bblks)); if (error) goto bread_err2; } error = xlog_recover_process(log, rhash, rhead, offset, pass, &buffer_list); if (error) goto bread_err2; blk_no += bblks; rhead_blk = blk_no; } ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; rhead_blk = blk_no; } /* read first part of physical log */ while (blk_no < head_blk) { error = xlog_bread(log, blk_no, hblks, hbp, &offset); if (error) goto bread_err2; rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; /* blocks in data section */ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); error = xlog_bread(log, blk_no+hblks, bblks, dbp, &offset); if (error) goto bread_err2; error = xlog_recover_process(log, rhash, rhead, offset, pass, &buffer_list); if (error) goto bread_err2; blk_no += bblks + hblks; rhead_blk = blk_no; } bread_err2: kvfree(dbp); bread_err1: kvfree(hbp); /* * Submit buffers that have been dirtied by the last record recovered. */ if (!list_empty(&buffer_list)) { if (error) { /* * If there has been an item recovery error then we * cannot allow partial checkpoint writeback to * occur. We might have multiple checkpoints with the * same start LSN in this buffer list, and partial * writeback of a checkpoint in this situation can * prevent future recovery of all the changes in the * checkpoints at this start LSN. * * Note: Shutting down the filesystem will result in the * delwri submission marking all the buffers stale, * completing them and cleaning up _XBF_LOGRECOVERY * state without doing any IO. */ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } error2 = xfs_buf_delwri_submit(&buffer_list); } if (error && first_bad) *first_bad = rhead_blk; /* * Transactions are freed at commit time but transactions without commit * records on disk are never committed. Free any that may be left in the * hash table. */ for (i = 0; i < XLOG_RHASH_SIZE; i++) { struct hlist_node *tmp; struct xlog_recover *trans; hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) xlog_recover_free_trans(trans); } return error ? error : error2; } /* * Do the recovery of the log. We actually do this in two phases. * The two passes are necessary in order to implement the function * of cancelling a record written into the log. The first pass * determines those things which have been cancelled, and the * second pass replays log items normally except for those which * have been cancelled. The handling of the replay and cancellations * takes place in the log item type specific routines. * * The table of items which have cancel records in the log is allocated * and freed at this level, since only here do we know when all of * the log recovery has been completed. */ STATIC int xlog_do_log_recovery( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { int error; ASSERT(head_blk != tail_blk); /* * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ error = xlog_alloc_buf_cancel_table(log); if (error) return error; error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); if (error != 0) goto out_cancel; /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); if (!error) xlog_check_buf_cancel_table(log); out_cancel: xlog_free_buf_cancel_table(log); return error; } /* * Do the actual recovery */ STATIC int xlog_do_recover( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp = mp->m_sb_bp; struct xfs_sb *sbp = &mp->m_sb; int error; trace_xfs_log_recover(log, head_blk, tail_blk); /* * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); if (error) return error; if (xlog_is_shutdown(log)) return -EIO; /* * We now update the tail_lsn since much of the recovery has completed * and there may be space available to use. If there were no extent or * iunlinks, we can free up the entire log. This was set in * xlog_find_tail to be the lsn of the last known good LR on disk. If * there are extent frees or iunlinks they will have some entries in the * AIL; so we look at the AIL to determine how to set the tail_lsn. */ xfs_ail_assign_tail_lsn(log->l_ailp); /* * Now that we've finished replaying all buffer and inode updates, * re-read the superblock and reverify it. */ xfs_buf_lock(bp); xfs_buf_hold(bp); error = _xfs_buf_read(bp, XBF_READ); if (error) { if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); ASSERT(0); } xfs_buf_relse(bp); return error; } /* Convert superblock from on-disk format */ xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); return error; } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; } /* * Perform recovery and re-initialize some log variables in xlog_find_tail. * * Return error or zero. */ int xlog_recover( struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; /* find the tail of the log */ error = xlog_find_tail(log, &head_blk, &tail_blk); if (error) return error; /* * The superblock was read before the log was available and thus the LSN * could not be verified. Check the superblock LSN against the current * LSN now that it's known. */ if (xfs_has_crc(log->l_mp) && !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn)) return -EINVAL; if (tail_blk != head_blk) { /* There used to be a comment here: * * disallow recovery on read-only mounts. note -- mount * checks for ENOSPC and turns it into an intelligent * error message. * ...but this is no longer true. Now, unless you specify * NORECOVERY (in which case this function would never be * called), we just go ahead and recover. We do this all * under the vfs layer, so we can get away with it unless * the device itself is read-only, in which case we fail. */ if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { return error; } /* * Version 5 superblock log feature mask validation. We know the * log is dirty so check if there are any unknown log features * in what we need to recover. If there are unknown features * (e.g. unsupported transactions, then simply reject the * attempt at recovery before touching anything. */ if (xfs_sb_is_v5(&log->l_mp->m_sb) && xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(log->l_mp, "Superblock has unknown incompatible log features (0x%x) enabled.", (log->l_mp->m_sb.sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); xfs_warn(log->l_mp, "The log can not be fully and/or safely recovered by this kernel."); xfs_warn(log->l_mp, "Please recover the log on a kernel that supports the unknown features."); return -EINVAL; } /* * Delay log recovery if the debug hook is set. This is debug * instrumentation to coordinate simulation of I/O failures with * log recovery. */ if (xfs_globals.log_recovery_delay) { xfs_notice(log->l_mp, "Delaying log recovery for %d seconds.", xfs_globals.log_recovery_delay); msleep(xfs_globals.log_recovery_delay * 1000); } xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); error = xlog_do_recover(log, head_blk, tail_blk); set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); } return error; } /* * In the first part of recovery we replay inodes and buffers and build up the * list of intents which need to be processed. Here we process the intents and * clean up the on disk unlinked inode lists. This is separated from the first * part of recovery so that the root and real-time bitmap inodes can be read in * from disk in between the two stages. This is necessary so that we can free * space in the real-time portion of the file system. * * We run this whole process under GFP_NOFS allocation context. We do a * combination of non-transactional and transactional work, yet we really don't * want to recurse into the filesystem from direct reclaim during any of this * processing. This allows all the recovery code run here not to care about the * memory allocation context it is running in. */ int xlog_recover_finish( struct xlog *log) { unsigned int nofs_flags = memalloc_nofs_save(); int error; error = xlog_recover_process_intents(log); if (error) { /* * Cancel all the unprocessed intent items now so that we don't * leave them pinned in the AIL. This can cause the AIL to * livelock on the pinned item if anyone tries to push the AIL * (inode reclaim does this) before we get around to * xfs_log_mount_cancel. */ xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); goto out_error; } /* * Sync the log to get all the intents out of the AIL. This isn't * absolutely necessary, but it helps in case the unlink transactions * would have problems pushing the intents out of the way. */ xfs_log_force(log->l_mp, XFS_LOG_SYNC); xlog_recover_process_iunlinks(log); /* * Recover any CoW staging blocks that are still referenced by the * ondisk refcount metadata. During mount there cannot be any live * staging extents as we have not permitted any user modifications. * Therefore, it is safe to free them all right now, even on a * read-only mount. */ error = xfs_reflink_recover_cow(log->l_mp); if (error) { xfs_alert(log->l_mp, "Failed to recover leftover CoW staging extents, err %d.", error); /* * If we get an error here, make sure the log is shut down * but return zero so that any log items committed since the * end of intents processing can be pushed through the CIL * and AIL. */ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); error = 0; goto out_error; } out_error: memalloc_nofs_restore(nofs_flags); return error; } void xlog_recover_cancel( struct xlog *log) { if (xlog_recovery_needed(log)) xlog_recover_cancel_intents(log); }
7 7 7 370 434 466 466 466 466 464 466 31 31 31 465 3 3 466 466 466 465 435 466 465 30 466 370 466 465 464 466 465 466 465 428 428 11 516 485 487 486 475 483 483 480 478 480 31 475 465 501 503 483 515 514 500 503 503 502 222 469 39 469 466 469 467 469 468 469 469 3 3 3 482 483 483 483 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" struct kmem_cache *xfs_buf_item_cache; static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_buf_log_item, bli_item); } /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( struct xfs_log_iovec *iovec) { struct xfs_buf_log_format *blfp = iovec->i_addr; char *bmp_end; char *item_end; if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) return false; item_end = (char *)iovec->i_addr + iovec->i_len; bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; return bmp_end <= item_end; } static inline int xfs_buf_log_format_size( struct xfs_buf_log_format *blfp) { return offsetof(struct xfs_buf_log_format, blf_data_map) + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } static inline bool xfs_buf_item_straddle( struct xfs_buf *bp, uint offset, int first_bit, int nbits) { void *first, *last; first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); last = xfs_buf_offset(bp, offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); if (last - first != nbits * XFS_BLF_CHUNK) return true; return false; } /* * Return the number of log iovecs and space needed to log the given buf log * item segment. * * It calculates this as 1 iovec for the buf log format structure and 1 for each * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged * in a single iovec. */ STATIC void xfs_buf_item_size_segment( struct xfs_buf_log_item *bip, struct xfs_buf_log_format *blfp, uint offset, int *nvecs, int *nbytes) { struct xfs_buf *bp = bip->bli_buf; int first_bit; int nbits; int next_bit; int last_bit; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (first_bit == -1) return; (*nvecs)++; *nbytes += xfs_buf_log_format_size(blfp); do { nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); /* * Straddling a page is rare because we don't log contiguous * chunks of unmapped buffers anywhere. */ if (nbits > 1 && xfs_buf_item_straddle(bp, offset, first_bit, nbits)) goto slow_scan; (*nvecs)++; *nbytes += nbits * XFS_BLF_CHUNK; /* * This takes the bit number to start looking from and * returns the next set bit from there. It returns -1 * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)first_bit + nbits + 1); } while (first_bit != -1); return; slow_scan: /* Count the first bit we jumped out of the above loop from */ (*nvecs)++; *nbytes += XFS_BLF_CHUNK; last_bit = first_bit; while (last_bit != -1) { /* * This takes the bit number to start looking from and * returns the next set bit from there. It returns -1 * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, last_bit + 1); /* * If we run out of bits, leave the loop, * else if we find a new set of bits bump the number of vecs, * else keep scanning the current set of bits. */ if (next_bit == -1) { break; } else if (next_bit != last_bit + 1 || xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { last_bit = next_bit; first_bit = next_bit; (*nvecs)++; nbits = 1; } else { last_bit++; nbits++; } *nbytes += XFS_BLF_CHUNK; } } /* * Return the number of log iovecs and space needed to log the given buf log * item. * * Discontiguous buffers need a format structure per region that is being * logged. This makes the changes in the buffer appear to log recovery as though * they came from separate buffers, just like would occur if multiple buffers * were used instead of a single discontiguous buffer. This enables * discontiguous buffers to be in-memory constructs, completely transparent to * what ends up on disk. * * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log * format structures. If the item has previously been logged and has dirty * regions, we do not relog them in stale buffers. This has the effect of * reducing the size of the relogged item by the amount of dirty data tracked * by the log item. This can result in the committing transaction reducing the * amount of space being consumed by the CIL. */ STATIC void xfs_buf_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; int i; int bytes; uint offset = 0; ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log is the buf log * format structure with the cancel flag in it as we are never * going to replay the changes tracked in the log item. */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); *nvecs += bip->bli_format_count; for (i = 0; i < bip->bli_format_count; i++) { *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); } return; } ASSERT(bip->bli_flags & XFS_BLI_LOGGED); if (bip->bli_flags & XFS_BLI_ORDERED) { /* * The buffer has been logged just to order it. It is not being * included in the transaction commit, so no vectors are used at * all. */ trace_xfs_buf_item_size_ordered(bip); *nvecs = XFS_LOG_VEC_ORDERED; return; } /* * The vector count is based on the number of buffer vectors we have * dirty bits in. This will only be greater than one when we have a * compound buffer with more than one segment dirty. Hence for compound * buffers we need to track which segment the dirty bits correspond to, * and when we move from one segment to the next increment the vector * count for the extra buf log format structure that will need to be * written. */ bytes = 0; for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset, nvecs, &bytes); offset += BBTOB(bp->b_maps[i].bm_len); } /* * Round up the buffer size required to minimise the number of memory * allocations that need to be done as this item grows when relogged by * repeated modifications. */ *nbytes = round_up(bytes, 512); trace_xfs_buf_item_size(bip); } static inline void xfs_buf_item_copy_iovec( struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, struct xfs_buf *bp, uint offset, int first_bit, uint nbits) { offset += first_bit * XFS_BLF_CHUNK; xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, xfs_buf_offset(bp, offset), nbits * XFS_BLF_CHUNK); } static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; int first_bit; int last_bit; int next_bit; uint nbits; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; /* * Base size is the actual size of the ondisk structure - it reflects * the actual size of the dirty bitmap rather than the size of the in * memory structure. */ base_size = xfs_buf_log_format_size(blfp); first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ return; } blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log * is the buf log format structure with the * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); return; } /* * Fill in an iovec for each set of contiguous chunks. */ do { ASSERT(first_bit >= 0); nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); /* * Straddling a page is rare because we don't log contiguous * chunks of unmapped buffers anywhere. */ if (nbits > 1 && xfs_buf_item_straddle(bp, offset, first_bit, nbits)) goto slow_scan; xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; /* * This takes the bit number to start looking from and * returns the next set bit from there. It returns -1 * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)first_bit + nbits + 1); } while (first_bit != -1); return; slow_scan: ASSERT(bp->b_addr == NULL); last_bit = first_bit; nbits = 1; for (;;) { /* * This takes the bit number to start looking from and * returns the next set bit from there. It returns -1 * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* * If we run out of bits fill in the last iovec and get out of * the loop. Else if we start a new set of bits then fill in * the iovec for the series we were looking at and start * counting the bits in the new one. Else we're still in the * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; break; } else if (next_bit != last_bit + 1 || xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; } else { last_bit++; nbits++; } } } /* * This is called to fill in the vector of log iovecs for the * given log buf item. It fills the first entry with a buf log * format structure, and the rest point to contiguous chunks * within the buffer. */ STATIC void xfs_buf_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); /* * If it is an inode buffer, transfer the in-memory state to the * format flags and clear the in-memory state. * * For buffer based inode allocation, we do not transfer * this state if the inode buffer allocation has not yet been committed * to the log as setting the XFS_BLI_INODE_BUF flag will prevent * correct replay of the inode allocation. * * For icreate item based inode allocation, the buffers aren't written * to the journal during allocation, and hence we should always tag the * buffer as an inode buffer so that the correct unlinked list replay * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { if (xfs_has_v3inodes(lip->li_log->l_mp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); offset += BBTOB(bp->b_maps[i].bm_len); } /* * Check to make sure everything is consistent. */ trace_xfs_buf_item_format(bip); } /* * This is called to pin the buffer associated with the buf log item in memory * so it cannot be written out. * * We take a reference to the buffer log item here so that the BLI life cycle * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and * inserted into the AIL. * * We also need to take a reference to the buffer itself as the BLI unpin * processing requires accessing the buffer after the BLI has dropped the final * BLI reference. See xfs_buf_item_unpin() for an explanation. * If unpins race to drop the final BLI reference and only the * BLI owns a reference to the buffer, then the loser of the race can have the * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per * pin count ensures the life cycle of the buffer extends for as * long as we hold the buffer pin reference in xfs_buf_item_unpin(). */ STATIC void xfs_buf_item_pin( struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_pin(bip); xfs_buf_hold(bip->bli_buf); atomic_inc(&bip->bli_refcount); atomic_inc(&bip->bli_buf->b_pin_count); } /* * This is called to unpin the buffer associated with the buf log item which was * previously pinned with a call to xfs_buf_item_pin(). We enter this function * with a buffer pin count, a buffer reference and a BLI reference. * * We must drop the BLI reference before we unpin the buffer because the AIL * doesn't acquire a BLI reference whenever it accesses it. Therefore if the * refcount drops to zero, the bli could still be AIL resident and the buffer * submitted for I/O at any point before we return. This can result in IO * completion freeing the buffer while we are still trying to access it here. * This race condition can also occur in shutdown situations where we abort and * unpin buffers from contexts other that journal IO completion. * * Hence we have to hold a buffer reference per pin count to ensure that the * buffer cannot be freed until we have finished processing the unpin operation. * The reference is taken in xfs_buf_item_pin(), and we must hold it until we * are done processing the buffer state. In the case of an abort (remove = * true) then we re-use the current pin reference as the IO reference we hand * off to IO failure handling. */ STATIC void xfs_buf_item_unpin( struct xfs_log_item *lip, int remove) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; int stale = bip->bli_flags & XFS_BLI_STALE; int freed; ASSERT(bp->b_log_item == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); /* * Nothing to do but drop the buffer pin reference if the BLI is * still active. */ if (!freed) { xfs_buf_rele(bp); return; } if (stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_flags & XBF_STALE); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(list_empty(&lip->li_trans)); ASSERT(!bp->b_transp); trace_xfs_buf_item_unpin_stale(bip); /* * The buffer has been locked and referenced since it was marked * stale so we own both lock and reference exclusively here. We * do not need the pin reference any more, so drop it now so * that we only have one reference to drop once item completion * processing is complete. */ xfs_buf_rele(bp); /* * If we get called here because of an IO error, we may or may * not have the item on the AIL. xfs_trans_ail_delete() will * take care of that situation. xfs_trans_ail_delete() drops * the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_item_done(bp); xfs_buf_inode_iodone(bp); ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); return; } if (remove) { /* * We need to simulate an async IO failures here to ensure that * the correct error completion is run on this buffer. This * requires a reference to the buffer and for the buffer to be * locked. We can safely pass ownership of the pin reference to * the IO to ensure that nothing can free the buffer while we * wait for the lock and then run the IO failure completion. */ xfs_buf_lock(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); return; } /* * BLI has no more active references - it will be moved to the AIL to * manage the remaining BLI/buffer life cycle. There is nothing left for * us to do here so drop the pin reference to the buffer. */ xfs_buf_rele(bp); } STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; uint rval = XFS_ITEM_SUCCESS; if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) { /* * If we have just raced with a buffer being pinned and it has * been marked stale, we could end up stalling until someone else * issues a log force to unpin the stale buffer. Check for the * race condition here so xfsaild recognizes the buffer is pinned * and queues a log force to move it along. */ if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; } ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_push(bip); /* has a previous flush failed due to IO errors? */ if (bp->b_flags & XBF_WRITE_FAIL) { xfs_buf_alert_ratelimited(bp, "XFS: Failing async write", "Failing async write on buffer block 0x%llx. Retrying async write.", (long long)xfs_buf_daddr(bp)); } if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); return rval; } /* * Drop the buffer log item refcount and take appropriate action. This helper * determines whether the bli must be freed or not, since a decrement to zero * does not necessarily mean the bli is unused. * * Return true if the bli is freed, false otherwise. */ bool xfs_buf_item_put( struct xfs_buf_log_item *bip) { struct xfs_log_item *lip = &bip->bli_item; bool aborted; bool dirty; /* drop the bli ref and return if it wasn't the last one */ if (!atomic_dec_and_test(&bip->bli_refcount)) return false; /* * We dropped the last ref and must free the item if clean or aborted. * If the bli is dirty and non-aborted, the buffer was clean in the * transaction but still awaiting writeback from previous changes. In * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || xlog_is_shutdown(lip->li_log); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; /* * The bli is aborted or clean. An aborted item may be in the AIL * regardless of dirty state. For example, consider an aborted * transaction that invalidated a dirty bli and cleared the dirty * state. */ if (aborted) xfs_trans_ail_delete(lip, 0); xfs_buf_item_relse(bip->bli_buf); return true; } /* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. * * This call ignores the recursion count. It is only called when the buffer * should REALLY be unlocked, regardless of the recursion count. * * We unconditionally drop the transaction's reference to the log item. If the * item was logged, then another reference was taken when it was pinned, so we * can safely drop the transaction reference now. This also allows us to avoid * potential races with the unpin code freeing the bli by not referencing the * bli after we've dropped the reference count. * * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item * if necessary but do not unlock the buffer. This is for support of * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't * free the item. */ STATIC void xfs_buf_item_release( struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; bool released; bool hold = bip->bli_flags & XFS_BLI_HOLD; bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) bool ordered = bip->bli_flags & XFS_BLI_ORDERED; bool dirty = bip->bli_flags & XFS_BLI_DIRTY; bool aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); #endif trace_xfs_buf_item_release(bip); /* * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. */ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || (ordered && dirty && !xfs_buf_item_dirty_format(bip))); ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); /* * Clear the buffer's association with this transaction and * per-transaction state from the bli, which has been copied above. */ bp->b_transp = NULL; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * Unref the item and unlock the buffer unless held or stale. Stale * buffers remain locked until final unpin unless the bli is freed by * the unref call. The latter implies shutdown because buffer * invalidation dirties the bli and transaction. */ released = xfs_buf_item_put(bip); if (hold || (stale && !released)) return; ASSERT(!stale || aborted); xfs_buf_relse(bp); } STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { return xfs_buf_item_release(lip); } /* * This is called to find out where the oldest active copy of the * buf log item in the on disk log resides now that the last log * write of it completed at the given lsn. * We always re-log all the dirty data in a buffer, so usually the * latest copy in the on disk log is the only one that matters. For * those cases we simply return the given lsn. * * The one exception to this is for buffers full of newly allocated * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF * flag set, indicating that only the di_next_unlinked fields from the * inodes in the buffers will be replayed during recovery. If the * original newly allocated inode images have not yet been flushed * when the buffer is so relogged, then we need to make sure that we * keep the old images in the 'active' portion of the log. We do this * by returning the original lsn of that transaction here rather than * the current one. */ STATIC xfs_lsn_t xfs_buf_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); trace_xfs_buf_item_committed(bip); if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) return lip->li_lsn; return lsn; } #ifdef DEBUG_EXPENSIVE static int xfs_buf_item_precommit( struct xfs_trans *tp, struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (!bp->b_ops || !bp->b_ops->verify_struct) return 0; if (bip->bli_flags & XFS_BLI_STALE) return 0; fa = bp->b_ops->verify_struct(bp); if (fa) { xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name, bp->b_addr, BBTOB(bp->b_length), fa); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ASSERT(fa == NULL); } return 0; } #else # define xfs_buf_item_precommit NULL #endif static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_precommit = xfs_buf_item_precommit, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, .iop_release = xfs_buf_item_release, .iop_committing = xfs_buf_item_committing, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, }; STATIC void xfs_buf_item_get_format( struct xfs_buf_log_item *bip, int count) { ASSERT(bip->bli_formats == NULL); bip->bli_format_count = count; if (count == 1) { bip->bli_formats = &bip->__bli_format; return; } bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), GFP_KERNEL | __GFP_NOFAIL); } STATIC void xfs_buf_item_free_format( struct xfs_buf_log_item *bip) { if (bip->bli_formats != &bip->__bli_format) { kfree(bip->bli_formats); bip->bli_formats = NULL; } } /* * Allocate a new buf log item to go with the given buffer. * Set the buffer's b_log_item field to point to the new * buf log item. */ int xfs_buf_item_init( struct xfs_buf *bp, struct xfs_mount *mp) { struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; int i; /* * Check to see if there is already a buf log item for * this buffer. If we do already have one, there is * nothing to do here so return. */ ASSERT(bp->b_mount == mp); if (bip) { ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!bp->b_transp); ASSERT(bip->bli_buf == bp); return 0; } bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; /* * chunks is the number of XFS_BLF_CHUNK size pieces the buffer * can be divided into. Make sure not to truncate any pieces. * map_size is the size of the bitmap needed to describe the * chunks of the buffer. * * Discontiguous buffer support follows the layout of the underlying * buffer. This makes the implementation as simple as possible. */ xfs_buf_item_get_format(bip, bp->b_map_count); for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), XFS_BLF_CHUNK); map_size = DIV_ROUND_UP(chunks, NBWORD); if (map_size > XFS_BLF_DATAMAP_SIZE) { kmem_cache_free(xfs_buf_item_cache, bip); xfs_err(mp, "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", map_size, BBTOB(bp->b_maps[i].bm_len)); return -EFSCORRUPTED; } bip->bli_formats[i].blf_type = XFS_LI_BUF; bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; bip->bli_formats[i].blf_map_size = map_size; } bp->b_log_item = bip; xfs_buf_hold(bp); return 0; } /* * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ static void xfs_buf_item_log_segment( uint first, uint last, uint *map) { uint first_bit; uint last_bit; uint bits_to_set; uint bits_set; uint word_num; uint *wordp; uint bit; uint end_bit; uint mask; ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); /* * Convert byte offsets to bit numbers. */ first_bit = first >> XFS_BLF_SHIFT; last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. */ bits_to_set = last_bit - first_bit + 1; /* * Get a pointer to the first word in the bitmap * to set a bit in. */ word_num = first_bit >> BIT_TO_WORD_SHIFT; wordp = &map[word_num]; /* * Calculate the starting bit in the first word. */ bit = first_bit & (uint)(NBWORD - 1); /* * First set any bits in the first word of our range. * If it starts at bit 0 of the word, it will be * set below rather than here. That is what the variable * bit tells us. The variable bits_set tracks the number * of bits that have been set so far. End_bit is the number * of the last bit to be set in this word plus one. */ if (bit) { end_bit = min(bit + bits_to_set, (uint)NBWORD); mask = ((1U << (end_bit - bit)) - 1) << bit; *wordp |= mask; wordp++; bits_set = end_bit - bit; } else { bits_set = 0; } /* * Now set bits a whole word at a time that are between * first_bit and last_bit. */ while ((bits_to_set - bits_set) >= NBWORD) { *wordp = 0xffffffff; bits_set += NBWORD; wordp++; } /* * Finally, set any bits left to be set in one last partial word. */ end_bit = bits_to_set - bits_set; if (end_bit) { mask = (1U << end_bit) - 1; *wordp |= mask; } } /* * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ void xfs_buf_item_log( struct xfs_buf_log_item *bip, uint first, uint last) { int i; uint start; uint end; struct xfs_buf *bp = bip->bli_buf; /* * walk each buffer segment and mark them dirty appropriately. */ start = 0; for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; end = start + BBTOB(bp->b_maps[i].bm_len) - 1; /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } /* * Trim the range to this segment and mark it in the bitmap. * Note that we must convert buffer offsets to segment relative * offsets (e.g., the first byte of each segment is byte 0 of * that segment). */ if (first < start) first = start; if (end > last) end = last; xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); start += BBTOB(bp->b_maps[i].bm_len); } } /* * Return true if the buffer has any ranges logged/dirtied by a transaction, * false otherwise. */ bool xfs_buf_item_dirty_format( struct xfs_buf_log_item *bip) { int i; for (i = 0; i < bip->bli_format_count; i++) { if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, bip->bli_formats[i].blf_map_size)) return true; } return false; } STATIC void xfs_buf_item_free( struct xfs_buf_log_item *bip) { xfs_buf_item_free_format(bip); kvfree(bip->bli_item.li_lv_shadow); kmem_cache_free(xfs_buf_item_cache, bip); } /* * xfs_buf_item_relse() is called when the buf log item is no longer needed. */ void xfs_buf_item_relse( struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); if (atomic_read(&bip->bli_refcount)) return; bp->b_log_item = NULL; xfs_buf_rele(bp); xfs_buf_item_free(bip); } void xfs_buf_item_done( struct xfs_buf *bp) { /* * If we are forcibly shutting down, this may well be off the AIL * already. That's because we simulate the log-committed callbacks to * unpin these buffers. Or we may never have put this item on AIL * because of the transaction was aborted forcibly. * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. * * Note that log recovery writes might have buffer items that are not on * the AIL even when the file system is not shut down. */ xfs_trans_ail_delete(&bp->b_log_item->bli_item, (bp->b_flags & _XBF_LOGRECOVERY) ? 0 : SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_relse(bp); }
47 47 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd. * https://www.oppo.com/ */ #include <linux/sysfs.h> #include <linux/kobject.h> #include "internal.h" enum { attr_feature, attr_pointer_ui, attr_pointer_bool, }; enum { struct_erofs_sb_info, struct_erofs_mount_opts, }; struct erofs_attr { struct attribute attr; short attr_id; int struct_type, offset; }; #define EROFS_ATTR(_name, _mode, _id) \ static struct erofs_attr erofs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ } #define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name) #define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature) #define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \ static struct erofs_attr erofs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ .struct_type = struct_##_struct, \ .offset = offsetof(struct _struct, _name),\ } #define EROFS_ATTR_RW(_name, _id, _struct) \ EROFS_ATTR_OFFSET(_name, 0644, _id, _struct) #define EROFS_RO_ATTR(_name, _id, _struct) \ EROFS_ATTR_OFFSET(_name, 0444, _id, _struct) #define EROFS_ATTR_RW_UI(_name, _struct) \ EROFS_ATTR_RW(_name, pointer_ui, _struct) #define EROFS_ATTR_RW_BOOL(_name, _struct) \ EROFS_ATTR_RW(_name, pointer_bool, _struct) #define ATTR_LIST(name) (&erofs_attr_##name.attr) #ifdef CONFIG_EROFS_FS_ZIP EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); #endif static struct attribute *erofs_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP ATTR_LIST(sync_decompress), #endif NULL, }; ATTRIBUTE_GROUPS(erofs); /* Features this copy of erofs supports */ EROFS_ATTR_FEATURE(zero_padding); EROFS_ATTR_FEATURE(compr_cfgs); EROFS_ATTR_FEATURE(big_pcluster); EROFS_ATTR_FEATURE(chunked_file); EROFS_ATTR_FEATURE(device_table); EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), ATTR_LIST(compr_cfgs), ATTR_LIST(big_pcluster), ATTR_LIST(chunked_file), ATTR_LIST(device_table), ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, int struct_type, int offset) { if (struct_type == struct_erofs_sb_info) return (unsigned char *)sbi + offset; if (struct_type == struct_erofs_mount_opts) return (unsigned char *)&sbi->opt + offset; return NULL; } static ssize_t erofs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); switch (a->attr_id) { case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_pointer_ui: if (!ptr) return 0; return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); case attr_pointer_bool: if (!ptr) return 0; return sysfs_emit(buf, "%d\n", *(bool *)ptr); } return 0; } static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); unsigned long t; int ret; switch (a->attr_id) { case attr_pointer_ui: if (!ptr) return 0; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t != (unsigned int)t) return -ERANGE; #ifdef CONFIG_EROFS_FS_ZIP if (!strcmp(a->attr.name, "sync_decompress") && (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) return -EINVAL; #endif *(unsigned int *)ptr = t; return len; case attr_pointer_bool: if (!ptr) return 0; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t != 0 && t != 1) return -EINVAL; *(bool *)ptr = !!t; return len; } return 0; } static void erofs_sb_release(struct kobject *kobj) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static const struct sysfs_ops erofs_attr_ops = { .show = erofs_attr_show, .store = erofs_attr_store, }; static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; static const struct kobj_type erofs_ktype = { .sysfs_ops = &erofs_attr_ops, }; static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; static struct kobject erofs_feat = { .kset = &erofs_root, }; int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", sb->s_sysfs_name); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } return err; } void erofs_unregister_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if (sbi->s_kobj.state_in_sysfs) { kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } } int __init erofs_init_sysfs(void) { int ret; kobject_set_name(&erofs_root.kobj, "erofs"); erofs_root.kobj.parent = fs_kobj; ret = kset_register(&erofs_root); if (ret) goto root_err; ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, NULL, "features"); if (ret) goto feat_err; return ret; feat_err: kobject_put(&erofs_feat); kset_unregister(&erofs_root); root_err: return ret; } void erofs_exit_sysfs(void) { kobject_put(&erofs_feat); kset_unregister(&erofs_root); }
941 5199 941 769 1258 1991 841 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IVERSION_H #define _LINUX_IVERSION_H #include <linux/fs.h> /* * The inode->i_version field: * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must * appear larger to observers if there was an explicit change to the inode's * data or metadata since it was last queried. * * An explicit change is one that would ordinarily result in a change to the * inode status change time (aka ctime). i_version must appear to change, even * if the ctime does not (since the whole point is to avoid missing updates due * to timestamp granularity). If POSIX or other relevant spec mandates that the * ctime must change due to an operation, then the i_version counter must be * incremented as well. * * Making the i_version update completely atomic with the operation itself would * be prohibitively expensive. Traditionally the kernel has updated the times on * directories after an operation that changes its contents. For regular files, * the ctime is usually updated before the data is copied into the cache for a * write. This means that there is a window of time when an observer can * associate a new timestamp with old file contents. Since the purpose of the * i_version is to allow for better cache coherency, the i_version must always * be updated after the results of the operation are visible. Updating it before * and after a change is also permitted. (Note that no filesystems currently do * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the * inode. If it's different then something has changed. Observers cannot infer * anything about the nature or magnitude of the changes from the value, only * that the inode has changed in some fashion. * * Not all filesystems properly implement the i_version counter. Subsystems that * want to use i_version field on an inode should first check whether the * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro). * * Those that set SB_I_VERSION will automatically have their i_version counter * incremented on writes to normal files. If the SB_I_VERSION is not set, then * the VFS will not touch it on writes, and the filesystem can use it how it * wishes. Note that the filesystem is always responsible for updating the * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.). * We consider these sorts of filesystems to have a kernel-managed i_version. * * It may be impractical for filesystems to keep i_version updates atomic with * respect to the changes that cause them. They should, however, guarantee * that i_version updates are never visible before the changes that caused * them. Also, i_version updates should never be delayed longer than it takes * the original change to reach disk. * * This implementation uses the low bit in the i_version field as a flag to * track when the value has been queried. If it has not been queried since it * was last incremented, we can skip the increment in most cases. * * In the event that we're updating the ctime, we will usually go ahead and * bump the i_version anyway. Since that has to go to stable storage in some * fashion, we might as well increment it as well. * * With this implementation, the value should always appear to observers to * increase over time if the file has changed. It's recommended to use * inode_eq_iversion() helper to compare values. * * Note that some filesystems (e.g. NFS and AFS) just use the field to store * a server-provided value (for the most part). For that reason, those * filesystems do not set SB_I_VERSION. These filesystems are considered to * have a self-managed i_version. * * Persistently storing the i_version * ---------------------------------- * Queries of the i_version field are not gated on them hitting the backing * store. It's always possible that the host could crash after allowing * a query of the value but before it has made it to disk. * * To mitigate this problem, filesystems should always use * inode_set_iversion_queried when loading an existing inode from disk. This * ensures that the next attempted inode increment will result in the value * changing. * * Storing the value to disk therefore does not count as a query, so those * filesystems should use inode_peek_iversion to grab the value to be stored. * There is no need to flag the value as having been queried in that case. */ /* * We borrow the lowest bit in the i_version to use as a flag to tell whether * it has been queried since we last incremented it. If it has, then we must * increment it on the next change. After that, we can clear the flag and * avoid incrementing it again until it has again been queried. */ #define I_VERSION_QUERIED_SHIFT (1) #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) /** * inode_set_iversion_raw - set i_version to the specified raw value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for use by * filesystems that self-manage the i_version. * * For example, the NFS client stores its NFSv4 change attribute in this way, * and the AFS client stores the data_version from the server here. */ static inline void inode_set_iversion_raw(struct inode *inode, u64 val) { atomic64_set(&inode->i_version, val); } /** * inode_peek_iversion_raw - grab a "raw" iversion value * @inode: inode from which i_version should be read * * Grab a "raw" inode->i_version value and return it. The i_version is not * flagged or converted in any way. This is mostly used to access a self-managed * i_version. * * With those filesystems, we want to treat the i_version as an entirely * opaque value. */ static inline u64 inode_peek_iversion_raw(const struct inode *inode) { return atomic64_read(&inode->i_version); } /** * inode_set_max_iversion_raw - update i_version new value is larger * @inode: inode to set * @val: new i_version to set * * Some self-managed filesystems (e.g Ceph) will only update the i_version * value if the new value is larger than the one we already have. */ static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { u64 cur = inode_peek_iversion_raw(inode); do { if (cur > val) break; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** * inode_set_iversion - set i_version to a particular value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for filesystems with * a kernel-managed i_version, for initializing a newly-created inode from * scratch. * * In this case, we do not set the QUERIED flag since we know that this value * has never been queried. */ static inline void inode_set_iversion(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); } /** * inode_set_iversion_queried - set i_version to a particular value as quereied * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val, and flag it for increment on the next * change. * * Filesystems that persistently store the i_version on disk should use this * when loading an existing inode from disk. * * When loading in an i_version value from a backing store, we can't be certain * that it wasn't previously viewed before being stored. Thus, we must assume * that it was, to ensure that we don't end up handing out the same value for * different versions of the same inode. */ static inline void inode_set_iversion_queried(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | I_VERSION_QUERIED); } bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version * @inode: inode that needs to be updated * * Forcbily increment the i_version field. This always results in a change to * the observable value. */ static inline void inode_inc_iversion(struct inode *inode) { inode_maybe_inc_iversion(inode, true); } /** * inode_iversion_need_inc - is the i_version in need of being incremented? * @inode: inode to check * * Returns whether the inode->i_version counter needs incrementing on the next * change. Just fetch the value and check the QUERIED flag. */ static inline bool inode_iversion_need_inc(struct inode *inode) { return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; } /** * inode_inc_iversion_raw - forcibly increment raw i_version * @inode: inode that needs to be updated * * Forcbily increment the raw i_version field. This always results in a change * to the raw value. * * NFS will use the i_version field to store the value from the server. It * mostly treats it as opaque, but in the case where it holds a write * delegation, it must increment the value itself. This function does that. */ static inline void inode_inc_iversion_raw(struct inode *inode) { atomic64_inc(&inode->i_version); } /** * inode_peek_iversion - read i_version without flagging it to be incremented * @inode: inode from which i_version should be read * * Read the inode i_version counter for an inode without registering it as a * query. * * This is typically used by local filesystems that need to store an i_version * on disk. In that situation, it's not necessary to flag it as having been * viewed, as the result won't be used to gauge changes from that point. */ static inline u64 inode_peek_iversion(const struct inode *inode) { return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: */ static inline u64 time_to_chattr(const struct timespec64 *t) { u64 chattr = t->tv_sec; chattr <<= 32; chattr += t->tv_nsec; return chattr; } u64 inode_query_iversion(struct inode *inode); /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare the current raw i_version counter with a previous one. Returns true * if they are the same or false if they are different. */ static inline bool inode_eq_iversion_raw(const struct inode *inode, u64 old) { return inode_peek_iversion_raw(inode) == old; } /** * inode_eq_iversion - check whether the i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare an i_version counter with a previous one. Returns true if they are * the same, and false if they are different. * * Note that we don't need to set the QUERIED flag in this case, as the value * in the inode is not being recorded for later use. */ static inline bool inode_eq_iversion(const struct inode *inode, u64 old) { return inode_peek_iversion(inode) == old; } #endif
92 92 92 92 92 54 54 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" bool irq_pm_check_wakeup(struct irq_desc *desc) { if (irqd_is_wakeup_armed(&desc->irq_data)) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; } /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. */ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions++; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth++; WARN_ON_ONCE(desc->force_resume_depth && desc->force_resume_depth != desc->nr_actions); if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* * Called from __free_irq() with desc->lock held after @action has * been removed from the action chain. */ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions--; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth--; if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc) { unsigned long chipflags = irq_desc_get_chip(desc)->flags; struct irq_data *irqd = &desc->irq_data; if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(irqd)) { irqd_set(irqd, IRQD_WAKEUP_ARMED); if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && irqd_irq_disabled(irqd)) { /* * Interrupt marked for wakeup is in disabled state. * Enable interrupt here to unmask/enable in irqchip * to be able to resume with such interrupts. */ __enable_irq(desc); irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the * IRQD_WAKEUP_ARMED is visible before we return from * suspend_device_irqs(). */ return true; } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility * requires that the non wakeup interrupts are masked at the * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } /** * suspend_device_irqs - disable all currently enabled interrupt lines * * During system-wide suspend or hibernation device drivers need to be * prevented from receiving interrupts and this function is provided * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND * set and those which are marked as active wakeup sources. * * The active wakeup sources are handled by the flow handler entry * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) synchronize_irq(irq); } } static void resume_irq(struct irq_desc *desc) { struct irq_data *irqd = &desc->irq_data; irqd_clear(irqd, IRQD_WAKEUP_ARMED); if (irqd_is_enabled_on_suspend(irqd)) { /* * Interrupt marked for wakeup was enabled during suspend * entry. Disable such interrupts to restore them back to * original state. */ __disable_irq(desc); irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } if (desc->istate & IRQS_SUSPENDED) goto resume; /* Force resume the interrupt? */ if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! */ desc->depth++; irq_state_set_disabled(desc); irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); } static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } /** * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup * @irq: Interrupt to rearm */ void rearm_wake_irq(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) goto unlock; desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); unlock: irq_put_desc_busunlock(desc, flags); } /** * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ static void irq_pm_syscore_resume(void) { resume_irqs(true); } static struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; static int __init irq_pm_init_ops(void) { register_syscore_ops(&irq_pm_syscore_ops); return 0; } device_initcall(irq_pm_init_ops); /** * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag * set as well as those with %IRQF_FORCE_RESUME. */ void resume_device_irqs(void) { resume_irqs(false); }
87 87 84 84 45 45 46 6 6 41 41 113 87 14 14 1 13 13 28 27 1 26 26 44 44 3 3 4 4 1 3 1 6 1 1 4 4 5 4 1 1 1 5 5 5 3 1 3 1 34 16 14 69 36 113 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * inode.c */ /* * This file implements code to create and read inodes from disk. * * Inodes in Squashfs are identified by a 48-bit inode which encodes the * location of the compressed metadata block containing the inode, and the byte * offset into that block where the inode is placed (<block, offset>). * * To maximise compression there are different inodes for each file type * (regular file, directory, device, etc.), the inode contents and length * varying with the type. * * To further maximise compression, two types of regular file inode and * directory inode are defined: inodes optimised for frequently occurring * regular files and directories, and extended types where extra * information has to be stored. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/xattr.h> #include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" /* * Initialise VFS inode with the base inode information common to all * Squashfs inode types. Sqsh_ino contains the unswapped base inode * off disk. */ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, struct squashfs_base_inode *sqsh_ino) { uid_t i_uid; gid_t i_gid; int err; inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); if (inode->i_ino == 0) return -EINVAL; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid); if (err) return err; i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; return err; } struct inode *squashfs_iget(struct super_block *sb, long long ino, unsigned int ino_number) { struct inode *inode = iget_locked(sb, ino_number); int err; TRACE("Entered squashfs_iget\n"); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); if (err) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } /* * Initialise VFS inode by reading inode from inode table (compressed * metadata). The format and amount of data read depends on type. */ int squashfs_read_inode(struct inode *inode, long long ino) { struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; int err, type, offset = SQUASHFS_INODE_OFFSET(ino); union squashfs_inode squashfs_ino; struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; int xattr_id = SQUASHFS_INVALID_XATTR; TRACE("Entered squashfs_read_inode\n"); /* * Read inode base common to all inode types. */ err = squashfs_read_metadata(sb, sqshb_ino, &block, &offset, sizeof(*sqshb_ino)); if (err < 0) goto failed_read; err = squashfs_new_inode(sb, inode, sqshb_ino); if (err) goto failed_read; block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; offset = SQUASHFS_INODE_OFFSET(ino); type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_LREG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = (inode->i_size - le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_DIR_TYPE: { struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_cnt = 0; squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_LDIR_TYPE: { struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_start = block; squashfs_i(inode)->dir_idx_offset = offset; squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Long directory inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_SYMLINK_TYPE: case SQUASHFS_LSYMLINK_TYPE: { struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); if (inode->i_size > PAGE_SIZE) { ERROR("Corrupted symlink\n"); return -EINVAL; } set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; if (type == SQUASHFS_LSYMLINK_TYPE) { __le32 xattr; err = squashfs_read_metadata(sb, NULL, &block, &offset, inode->i_size); if (err < 0) goto failed_read; err = squashfs_read_metadata(sb, &xattr, &block, &offset, sizeof(xattr)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(xattr); } TRACE("Symbolic link inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, block, offset); break; } case SQUASHFS_BLKDEV_TYPE: case SQUASHFS_CHRDEV_TYPE: { struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_CHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_LBLKDEV_TYPE: case SQUASHFS_LCHRDEV_TYPE: { struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LCHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_FIFO_TYPE: case SQUASHFS_SOCKET_TYPE: { struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_FIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } case SQUASHFS_LFIFO_TYPE: case SQUASHFS_LSOCKET_TYPE: { struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LFIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } default: ERROR("Unknown inode type %d in squashfs_iget!\n", type); return -EINVAL; } if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { err = squashfs_xattr_lookup(sb, xattr_id, &squashfs_i(inode)->xattr_count, &squashfs_i(inode)->xattr_size, &squashfs_i(inode)->xattr); if (err < 0) goto failed_read; inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + 1; } else squashfs_i(inode)->xattr_count = 0; return 0; failed_read: ERROR("Unable to read inode 0x%llx\n", ino); return err; } const struct inode_operations squashfs_inode_ops = { .listxattr = squashfs_listxattr };
188 188 188 188 188 188 188 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_sysctl.h" #include "xfs_pwork.h" #include <linux/nmi.h> /* * Parallel Work Queue * =================== * * Abstract away the details of running a large and "obviously" parallelizable * task across multiple CPUs. Callers initialize the pwork control object with * a desired level of parallelization and a work function. Next, they embed * struct xfs_pwork in whatever structure they use to pass work context to a * worker thread and queue that pwork. The work function will be passed the * pwork item when it is run (from process context) and any returned error will * be recorded in xfs_pwork_ctl.error. Work functions should check for errors * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not * stop workqueue item processing. * * This is the rough equivalent of the xfsprogs workqueue code, though we can't * reuse that name here. */ /* Invoke our caller's function. */ static void xfs_pwork_work( struct work_struct *work) { struct xfs_pwork *pwork; struct xfs_pwork_ctl *pctl; int error; pwork = container_of(work, struct xfs_pwork, work); pctl = pwork->pctl; error = pctl->work_fn(pctl->mp, pwork); if (error && !pctl->error) pctl->error = error; if (atomic_dec_and_test(&pctl->nr_work)) wake_up(&pctl->poll_wait); } /* * Set up control data for parallel work. @work_fn is the function that will * be called. @tag will be written into the kernel threads. @nr_threads is * the level of parallelism desired, or 0 for no limit. */ int xfs_pwork_init( struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, xfs_pwork_work_fn work_fn, const char *tag) { unsigned int nr_threads = 0; #ifdef DEBUG if (xfs_globals.pwork_threads >= 0) nr_threads = xfs_globals.pwork_threads; #endif trace_xfs_pwork_init(mp, nr_threads, current->pid); pctl->wq = alloc_workqueue("%s-%d", WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag, current->pid); if (!pctl->wq) return -ENOMEM; pctl->work_fn = work_fn; pctl->error = 0; pctl->mp = mp; atomic_set(&pctl->nr_work, 0); init_waitqueue_head(&pctl->poll_wait); return 0; } /* Queue some parallel work. */ void xfs_pwork_queue( struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork) { INIT_WORK(&pwork->work, xfs_pwork_work); pwork->pctl = pctl; atomic_inc(&pctl->nr_work); queue_work(pctl->wq, &pwork->work); } /* Wait for the work to finish and tear down the control structure. */ int xfs_pwork_destroy( struct xfs_pwork_ctl *pctl) { destroy_workqueue(pctl->wq); pctl->wq = NULL; return pctl->error; } /* * Wait for the work to finish by polling completion status and touch the soft * lockup watchdog. This is for callers such as mount which hold locks. */ void xfs_pwork_poll( struct xfs_pwork_ctl *pctl) { while (wait_event_timeout(pctl->poll_wait, atomic_read(&pctl->nr_work) == 0, HZ) == 0) touch_softlockup_watchdog(); }
2 2 2 2 2 4 4 4 4 4 17 17 2 2 2 15 17 17 71 71 71 1 71 71 71 71 71 1 71 70 5 5 2 5 4 4 5 2 4 5 5 5 4 4 1 1 1 4 4 4 4 1 1 1 1 1 3 3 3 1 70 70 15 15 68 70 15 1 2 2 2 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 // SPDX-License-Identifier: GPL-2.0-only /* * acl.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. * * CREDITS: * Lots of code in this file is copy from linux/fs/ext3/acl.c. * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "journal.h" #include "ocfs2_fs.h" #include "xattr.h" #include "acl.h" /* * Convert from xattr value to acl struct. */ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) { int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(struct posix_acl_entry)) return ERR_PTR(-EINVAL); count = size / sizeof(struct posix_acl_entry); acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { struct ocfs2_acl_entry *entry = (struct ocfs2_acl_entry *)value; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch(acl->a_entries[n].e_tag) { case ACL_USER: acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: break; } value += sizeof(struct posix_acl_entry); } return acl; } /* * Convert acl struct to xattr value. */ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) { struct ocfs2_acl_entry *entry = NULL; char *ocfs2_acl; size_t n; *size = acl->a_count * sizeof(struct posix_acl_entry); ocfs2_acl = kmalloc(*size, GFP_NOFS); if (!ocfs2_acl) return ERR_PTR(-ENOMEM); entry = (struct ocfs2_acl_entry *)ocfs2_acl; for (n = 0; n < acl->a_count; n++, entry++) { entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); switch(acl->a_entries[n].e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl->a_entries[n].e_uid)); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl->a_entries[n].e_gid)); break; default: entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } return ocfs2_acl; } static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, int type, struct buffer_head *di_bh) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); } retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", value, retval); } if (retval > 0) acl = ocfs2_acl_from_xattr(value, retval); else if (retval == -ENODATA || retval == 0) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } /* * Helper function to set i_mode in memory and disk. Some call paths * will not have di_bh or a journal handle to pass, in which case it * will create it's own. */ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, handle_t *handle, umode_t new_mode) { int ret, commit_handle = 0; struct ocfs2_dinode *di; if (di_bh == NULL) { ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; } } else get_bh(di_bh); if (handle == NULL) { handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_brelse; } commit_handle = 1; } di = (struct ocfs2_dinode *)di_bh->b_data; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } inode->i_mode = new_mode; inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: if (commit_handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out_brelse: brelse(di_bh); out: return ret; } /* * Set the access or default ACL of an inode. */ static int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac) { int name_index; void *value = NULL; size_t size = 0; int ret; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = ocfs2_acl_to_xattr(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } if (handle) ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, "", value, size, 0, meta_ac, data_ac); else ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); kfree(value); if (!ret) set_cached_acl(inode, type, acl); return ret; } int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; struct ocfs2_lock_holder oh; struct inode *inode = d_inode(dentry); had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) return had_lock; if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (status) goto unlock; status = ocfs2_acl_set_mode(inode, bh, NULL, mode); if (status) goto unlock; } status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); unlock: ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; } struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int had_lock; struct ocfs2_lock_holder oh; if (rcu) return ERR_PTR(-ECHILD); osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); if (had_lock < 0) return ERR_PTR(had_lock); down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); return acl; } int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl; int ret; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); up_read(&OCFS2_I(inode)->ip_xattr_sem); if (IS_ERR_OR_NULL(acl)) return PTR_ERR_OR_ZERO(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, acl, NULL, NULL); posix_acl_release(acl); return ret; } /* * Initialize the ACLs of a new inode. If parent directory has default ACL, * then clone to new inode. Called from ocfs2_mknod. */ int ocfs2_init_acl(handle_t *handle, struct inode *inode, struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dir_bh, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; int ret = 0, ret2; umode_t mode; if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } if (!acl) { mode = inode->i_mode & ~current_umask(); ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); if (ret) { mlog_errno(ret); goto cleanup; } } } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { if (S_ISDIR(inode->i_mode)) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_DEFAULT, acl, meta_ac, data_ac); if (ret) goto cleanup; } mode = inode->i_mode; ret = __posix_acl_create(&acl, GFP_NOFS, &mode); if (ret < 0) return ret; ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); if (ret2) { mlog_errno(ret2); ret = ret2; goto cleanup; } if (ret > 0) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_ACCESS, acl, meta_ac, data_ac); } } cleanup: posix_acl_release(acl); return ret; }
9 9 35 35 35 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_buf.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "disk_accounting.h" #include "error.h" #include "extents.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include <linux/prefetch.h> #include <linux/sort.h> static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { return (cmp_int(l->hi, r->hi) ?: cmp_int(l->mi, r->mi) ?: cmp_int(l->lo, r->lo)) >= 0; } static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { #ifdef CONFIG_X86_64 int cmp; asm("mov (%[l]), %%rax;" "sub (%[r]), %%rax;" "mov 8(%[l]), %%rax;" "sbb 8(%[r]), %%rax;" "mov 16(%[l]), %%rax;" "sbb 16(%[r]), %%rax;" : "=@ccae" (cmp) : [l] "r" (l), [r] "r" (r) : "rax", "cc"); EBUG_ON(cmp != __wb_key_ref_cmp(l, r)); return cmp; #else return __wb_key_ref_cmp(l, r); #endif } static int wb_key_seq_cmp(const void *_l, const void *_r) { const struct btree_write_buffered_key *l = _l; const struct btree_write_buffered_key *r = _r; return cmp_int(l->journal_seq, r->journal_seq); } /* Compare excluding idx, the low 24 bits: */ static inline bool wb_key_eq(const void *_l, const void *_r) { const struct wb_key_ref *l = _l; const struct wb_key_ref *r = _r; return !((l->hi ^ r->hi)| (l->mi ^ r->mi)| ((l->lo >> 24) ^ (r->lo >> 24))); } static noinline void wb_sort(struct wb_key_ref *base, size_t num) { size_t n = num, a = num / 2; if (!a) /* num < 2 || size == 0 */ return; for (;;) { size_t b, c, d; if (a) /* Building heap: sift down --a */ --a; else if (--n) /* Sorting: Extract root to --n */ swap(base[0], base[n]); else /* Sort complete */ break; /* * Sift element at "a" down into heap. This is the * "bottom-up" variant, which significantly reduces * calls to cmp_func(): we find the sift-down path all * the way to the leaves (one compare per level), then * backtrack to find where to insert the target element. * * Because elements tend to sift down close to the leaves, * this uses fewer compares than doing two per level * on the way down. (A bit more than half as many on * average, 3/4 worst-case.) */ for (b = a; c = 2*b + 1, (d = c + 1) < n;) b = wb_key_ref_cmp(base + c, base + d) ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; /* Now backtrack from "b" to the correct location for "a" */ while (b != a && wb_key_ref_cmp(base + a, base + b)) b = (b - 1) / 2; c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = (b - 1) / 2; swap(base[b], base[c]); } } } static noinline int wb_flush_one_slowpath(struct btree_trans *trans, struct btree_iter *iter, struct btree_write_buffered_key *wb) { struct btree_path *path = btree_iter_path(trans, iter); bch2_btree_node_unlock_write(trans, path, path->l[0].b); trans->journal_res.seq = wb->journal_seq; return bch2_trans_update(trans, iter, &wb->k, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_journal_res| BCH_TRANS_COMMIT_journal_reclaim); } static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, struct btree_write_buffered_key *wb, bool *write_locked, bool *accounting_accumulated, size_t *fast) { struct btree_path *path; int ret; EBUG_ON(!wb->journal_seq); EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); ret = bch2_btree_iter_traverse(iter); if (ret) return ret; if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); if (k.k->type == KEY_TYPE_accounting) bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), bkey_s_c_to_accounting(k)); } *accounting_accumulated = true; /* * We can't clone a path that has write locks: unshare it now, before * set_pos and traverse(): */ if (btree_iter_path(trans, iter)->ref > 1) iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); path = btree_iter_path(trans, iter); if (!*write_locked) { ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); if (ret) return ret; bch2_btree_node_prep_for_write(trans, path, path->l[0].b); *write_locked = true; } if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { *write_locked = false; return wb_flush_one_slowpath(trans, iter, wb); } bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; return 0; } /* * Update a btree with a write buffered key using the journal seq of the * original write buffer insert. * * It is not safe to rejournal the key once it has been inserted into the write * buffer because that may break recovery ordering. For example, the key may * have already been modified in the active write buffer in a seq that comes * before the current transaction. If we were to journal this key again and * crash, recovery would process updates in the wrong order. */ static int btree_write_buffered_insert(struct btree_trans *trans, struct btree_write_buffered_key *wb) { struct btree_iter iter; int ret; bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), BTREE_ITER_cached|BTREE_ITER_intent); trans->journal_res.seq = wb->journal_seq; ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &wb->k, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); return ret; } static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) { struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); struct journal *j = &c->journal; if (!wb->inc.keys.nr) return; bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, bch2_btree_write_buffer_journal_flush); darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); darray_resize(&wb->sorted, wb->flushing.keys.size); if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { swap(wb->flushing.keys, wb->inc.keys); goto out; } size_t nr = min(darray_room(wb->flushing.keys), wb->sorted.size - wb->flushing.keys.nr); nr = min(nr, wb->inc.keys.nr); memcpy(&darray_top(wb->flushing.keys), wb->inc.keys.data, sizeof(wb->inc.keys.data[0]) * nr); memmove(wb->inc.keys.data, wb->inc.keys.data + nr, sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); wb->flushing.keys.nr += nr; wb->inc.keys.nr -= nr; out: if (!wb->inc.keys.nr) bch2_journal_pin_drop(j, &wb->inc.pin); else bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, bch2_btree_write_buffer_journal_flush); if (j->watermark) { spin_lock(&j->lock); bch2_journal_set_watermark(j); spin_unlock(&j->lock); } BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_iter iter = { NULL }; size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; bch2_trans_unlock(trans); bch2_trans_begin(trans); mutex_lock(&wb->inc.lock); move_keys_from_inc_to_flushing(wb); mutex_unlock(&wb->inc.lock); for (size_t i = 0; i < wb->flushing.keys.nr; i++) { wb->sorted.data[i].idx = i; wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); } wb->sorted.nr = wb->flushing.keys.nr; /* * We first sort so that we can detect and skip redundant updates, and * then we attempt to flush in sorted btree order, as this is most * efficient. * * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is * flushed - which means this could deadlock the journal if we weren't * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail * if it would block taking a journal reservation. * * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. */ wb_sort(wb->sorted.data, wb->sorted.nr); darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); BUG_ON(!k->journal_seq); if (!accounting_replay_done && k->k.k.type == KEY_TYPE_accounting) { slowpath++; continue; } if (i + 1 < &darray_top(wb->sorted) && wb_key_eq(i, i + 1)) { struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; if (k->k.k.type == KEY_TYPE_accounting && n->k.k.type == KEY_TYPE_accounting) bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), bkey_i_to_s_c_accounting(&k->k)); overwritten++; n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); k->journal_seq = 0; continue; } if (write_locked) { struct btree_path *path = btree_iter_path(trans, &iter); if (path->btree_id != i->btree || bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { bch2_btree_node_unlock_write(trans, path, path->l[0].b); write_locked = false; ret = lockrestart_do(trans, bch2_btree_iter_traverse(&iter) ?: bch2_foreground_maybe_merge(trans, iter.path, 0, BCH_WATERMARK_reclaim| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc)); if (ret) goto err; } } if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_intent|BTREE_ITER_all_snapshots); } bch2_btree_iter_set_pos(&iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; bool accounting_accumulated = false; do { if (race_fault()) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } ret = wb_flush_one(trans, &iter, k, &write_locked, &accounting_accumulated, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); if (!ret) { k->journal_seq = 0; } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; ret = 0; } else break; } if (write_locked) { struct btree_path *path = btree_iter_path(trans, &iter); bch2_btree_node_unlock_write(trans, path, path->l[0].b); } bch2_trans_iter_exit(trans, &iter); if (ret) goto err; if (slowpath) { /* * Flush in the order they were present in the journal, so that * we can release journal pins: * The fastpath zapped the seq of keys that were successfully flushed so * we can skip those here. */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); sort(wb->flushing.keys.data, wb->flushing.keys.nr, sizeof(wb->flushing.keys.data[0]), wb_key_seq_cmp, NULL); darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; if (!accounting_replay_done && i->k.k.type == KEY_TYPE_accounting) { could_not_insert++; continue; } if (!could_not_insert) bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, bch2_btree_write_buffer_journal_flush); bch2_trans_begin(trans); ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_journal_res , btree_write_buffered_insert(trans, i)); if (ret) goto err; i->journal_seq = 0; } /* * If journal replay hasn't finished with accounting keys we * can't flush accounting keys at all - condense them and leave * them for next time. * * Q: Can the write buffer overflow? * A Shouldn't be any actual risk. It's just new accounting * updates that the write buffer can't flush, and those are only * going to be generated by interior btree node updates as * journal replay has to split/rewrite nodes to make room for * its updates. * * And for those new acounting updates, updates to the same * counters get accumulated as they're flushed from the journal * to the write buffer - see the patch for eytzingcer tree * accumulated. So we could only overflow if the number of * distinct counters touched somehow was very large. */ if (could_not_insert) { struct btree_write_buffered_key *dst = wb->flushing.keys.data; darray_for_each(wb->flushing.keys, i) if (i->journal_seq) *dst++ = *i; wb->flushing.keys.nr = dst - wb->flushing.keys.data; } } err: if (ret || !could_not_insert) { bch2_journal_pin_drop(j, &wb->flushing.pin); wb->flushing.keys.nr = 0; } bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0); return ret; } static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) { struct journal *j = &c->journal; struct journal_buf *buf; int ret = 0; while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { ret = bch2_journal_keys_to_write_buffer(c, buf); mutex_unlock(&j->buf_lock); } return ret; } static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; int ret = 0, fetch_from_journal_err; do { bch2_trans_unlock(trans); fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); /* * On memory allocation failure, bch2_btree_write_buffer_flush_locked() * is not guaranteed to empty wb->inc: */ mutex_lock(&wb->flushing.lock); ret = bch2_btree_write_buffer_flush_locked(trans); mutex_unlock(&wb->flushing.lock); } while (!ret && (fetch_from_journal_err || (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); return ret; } static int bch2_btree_write_buffer_journal_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); } int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; int ret = 0; if (mutex_trylock(&wb->flushing.lock)) { ret = bch2_btree_write_buffer_flush_locked(trans); mutex_unlock(&wb->flushing.lock); } return ret; } int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { struct bch_fs *c = trans->c; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) return -BCH_ERR_erofs_no_writes; int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); return ret; } /* * In check and repair code, when checking references to write buffer btrees we * need to issue a flush before we have a definitive error: this issues a flush * if this is a key we haven't yet checked. */ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, struct bkey_s_c referring_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct bkey_buf tmp; int ret = 0; bch2_bkey_buf_init(&tmp); if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { bch2_bkey_buf_reassemble(&tmp, c, referring_k); if (bkey_is_btree_ptr(referring_k.k)) { bch2_trans_unlock(trans); bch2_btree_interior_updates_flush(c); } ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; bch2_bkey_buf_copy(last_flushed, c, tmp.k); ret = -BCH_ERR_transaction_restart_write_buffer_flush; } err: bch2_bkey_buf_exit(&tmp, c); return ret; } static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); struct btree_write_buffer *wb = &c->btree_write_buffer; int ret; mutex_lock(&wb->flushing.lock); do { ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); } while (!ret && bch2_btree_write_buffer_should_flush(c)); mutex_unlock(&wb->flushing.lock); bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); } static void wb_accounting_sort(struct btree_write_buffer *wb) { eytzinger0_sort(wb->accounting.data, wb->accounting.nr, sizeof(wb->accounting.data[0]), wb_key_cmp, NULL); } int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, struct bkey_i_accounting *k) { struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_write_buffered_key new = { .btree = btree }; bkey_copy(&new.k, &k->k_i); int ret = darray_push(&wb->accounting, new); if (ret) return ret; wb_accounting_sort(wb); return 0; } int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { struct btree_write_buffer *wb = &c->btree_write_buffer; int ret; retry: ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); if (!ret && dst->wb == &wb->flushing) ret = darray_resize(&wb->sorted, wb->flushing.keys.size); if (unlikely(ret)) { if (dst->wb == &c->btree_write_buffer.flushing) { mutex_unlock(&dst->wb->lock); dst->wb = &c->btree_write_buffer.inc; bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); goto retry; } return ret; } dst->room = darray_room(dst->wb->keys); if (dst->wb == &wb->flushing) dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); BUG_ON(!dst->room); BUG_ON(!dst->seq); struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); wb_k->journal_seq = dst->seq; wb_k->btree = btree; bkey_copy(&wb_k->k, k); dst->wb->keys.nr++; dst->room--; return 0; } void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) { struct btree_write_buffer *wb = &c->btree_write_buffer; if (mutex_trylock(&wb->flushing.lock)) { mutex_lock(&wb->inc.lock); move_keys_from_inc_to_flushing(wb); /* * Attempt to skip wb->inc, and add keys directly to * wb->flushing, saving us a copy later: */ if (!wb->inc.keys.nr) { dst->wb = &wb->flushing; } else { mutex_unlock(&wb->flushing.lock); dst->wb = &wb->inc; } } else { mutex_lock(&wb->inc.lock); dst->wb = &wb->inc; } dst->room = darray_room(dst->wb->keys); if (dst->wb == &wb->flushing) dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); dst->seq = seq; bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); darray_for_each(wb->accounting, i) memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); } int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) { struct btree_write_buffer *wb = &c->btree_write_buffer; unsigned live_accounting_keys = 0; int ret = 0; darray_for_each(wb->accounting, i) if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { i->journal_seq = dst->seq; live_accounting_keys++; ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); if (ret) break; } if (live_accounting_keys * 2 < wb->accounting.nr) { struct btree_write_buffered_key *dst = wb->accounting.data; darray_for_each(wb->accounting, src) if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) *dst++ = *src; wb->accounting.nr = dst - wb->accounting.data; wb_accounting_sort(wb); } if (!dst->wb->keys.nr) bch2_journal_pin_drop(&c->journal, &dst->wb->pin); if (bch2_btree_write_buffer_should_flush(c) && __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); mutex_unlock(&wb->inc.lock); return ret; } static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) { struct journal_keys_to_wb dst; int ret = 0; bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { jset_entry_for_each_key(entry, k) { ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); if (ret) goto out; } entry->type = BCH_JSET_ENTRY_btree_keys; } spin_lock(&c->journal.lock); buf->need_flush_to_write_buffer = false; spin_unlock(&c->journal.lock); out: ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; } static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) { if (wb->keys.size >= new_size) return 0; if (!mutex_trylock(&wb->lock)) return -EINTR; int ret = darray_resize(&wb->keys, new_size); mutex_unlock(&wb->lock); return ret; } int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) { struct btree_write_buffer *wb = &c->btree_write_buffer; return wb_keys_resize(&wb->flushing, new_size) ?: wb_keys_resize(&wb->inc, new_size); } void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && !bch2_journal_error(&c->journal)); darray_exit(&wb->accounting); darray_exit(&wb->sorted); darray_exit(&wb->flushing.keys); darray_exit(&wb->inc.keys); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; mutex_init(&wb->inc.lock); mutex_init(&wb->flushing.lock); INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); /* Will be resized by journal as needed: */ unsigned initial_size = 1 << 16; return darray_make_room(&wb->inc.keys, initial_size) ?: darray_make_room(&wb->flushing.keys, initial_size) ?: darray_make_room(&wb->sorted, initial_size); }
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 // SPDX-License-Identifier: GPL-2.0-only /* * Linux driver for digital TV devices equipped with B2C2 FlexcopII(b)/III * flexcop-usb.c - covers the USB part * see flexcop.c for copyright information */ #define FC_LOG_PREFIX "flexcop_usb" #include "flexcop-usb.h" #include "flexcop-common.h" /* Version information */ #define DRIVER_VERSION "0.1" #define DRIVER_NAME "Technisat/B2C2 FlexCop II/IIb/III Digital TV USB Driver" #define DRIVER_AUTHOR "Patrick Boettcher <patrick.boettcher@posteo.de>" /* debug */ #ifdef CONFIG_DVB_B2C2_FLEXCOP_DEBUG #define dprintk(level, args...) \ do { if ((debug & (level))) printk(args); } while (0) #define debug_dump(b, l, method) do {\ int i; \ for (i = 0; i < l; i++) \ method("%02x ", b[i]); \ method("\n"); \ } while (0) #define DEBSTATUS "" #else #define dprintk(level, args...) no_printk(args) #define debug_dump(b, l, method) do { } while (0) #define DEBSTATUS " (debugging is not enabled)" #endif static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "set debugging level (1=info,ts=2,ctrl=4,i2c=8,v8mem=16 (or-able))." DEBSTATUS); #undef DEBSTATUS #define deb_info(args...) dprintk(0x01, args) #define deb_ts(args...) dprintk(0x02, args) #define deb_ctrl(args...) dprintk(0x04, args) #define deb_i2c(args...) dprintk(0x08, args) #define deb_v8(args...) dprintk(0x10, args) /* JLP 111700: we will include the 1 bit gap between the upper and lower 3 bits * in the IBI address, to make the V8 code simpler. * PCI ADDRESS FORMAT: 0x71C -> 0000 0111 0001 1100 (the six bits used) * in general: 0000 0HHH 000L LL00 * IBI ADDRESS FORMAT: RHHH BLLL * * where R is the read(1)/write(0) bit, B is the busy bit * and HHH and LLL are the two sets of three bits from the PCI address. */ #define B2C2_FLEX_PCIOFFSET_TO_INTERNALADDR(usPCI) (u8) \ (((usPCI >> 2) & 0x07) + ((usPCI >> 4) & 0x70)) #define B2C2_FLEX_INTERNALADDR_TO_PCIOFFSET(ucAddr) (u16) \ (((ucAddr & 0x07) << 2) + ((ucAddr & 0x70) << 4)) /* * DKT 020228 * - forget about this VENDOR_BUFFER_SIZE, read and write register * deal with DWORD or 4 bytes, that should be should from now on * - from now on, we don't support anything older than firm 1.00 * I eliminated the write register as a 2 trip of writing hi word and lo word * and force this to write only 4 bytes at a time. * NOTE: this should work with all the firmware from 1.00 and newer */ static int flexcop_usb_readwrite_dw(struct flexcop_device *fc, u16 wRegOffsPCI, u32 *val, u8 read) { struct flexcop_usb *fc_usb = fc->bus_specific; u8 request = read ? B2C2_USB_READ_REG : B2C2_USB_WRITE_REG; u8 request_type = (read ? USB_DIR_IN : USB_DIR_OUT) | USB_TYPE_VENDOR; u8 wAddress = B2C2_FLEX_PCIOFFSET_TO_INTERNALADDR(wRegOffsPCI) | (read ? 0x80 : 0); int ret; mutex_lock(&fc_usb->data_mutex); if (!read) memcpy(fc_usb->data, val, sizeof(*val)); ret = usb_control_msg(fc_usb->udev, read ? B2C2_USB_CTRL_PIPE_IN : B2C2_USB_CTRL_PIPE_OUT, request, request_type, /* 0xc0 read or 0x40 write */ wAddress, 0, fc_usb->data, sizeof(u32), B2C2_WAIT_FOR_OPERATION_RDW); if (ret != sizeof(u32)) { err("error while %s dword from %d (%d).", read ? "reading" : "writing", wAddress, wRegOffsPCI); if (ret >= 0) ret = -EIO; } if (read && ret >= 0) memcpy(val, fc_usb->data, sizeof(*val)); mutex_unlock(&fc_usb->data_mutex); return ret; } /* * DKT 010817 - add support for V8 memory read/write and flash update */ static int flexcop_usb_v8_memory_req(struct flexcop_usb *fc_usb, flexcop_usb_request_t req, u8 page, u16 wAddress, u8 *pbBuffer, u32 buflen) { u8 request_type = USB_TYPE_VENDOR; u16 wIndex; int nWaitTime, pipe, ret; wIndex = page << 8; if (buflen > sizeof(fc_usb->data)) { err("Buffer size bigger than max URB control message\n"); return -EIO; } switch (req) { case B2C2_USB_READ_V8_MEM: nWaitTime = B2C2_WAIT_FOR_OPERATION_V8READ; request_type |= USB_DIR_IN; pipe = B2C2_USB_CTRL_PIPE_IN; break; case B2C2_USB_WRITE_V8_MEM: wIndex |= pbBuffer[0]; request_type |= USB_DIR_OUT; nWaitTime = B2C2_WAIT_FOR_OPERATION_V8WRITE; pipe = B2C2_USB_CTRL_PIPE_OUT; break; case B2C2_USB_FLASH_BLOCK: request_type |= USB_DIR_OUT; nWaitTime = B2C2_WAIT_FOR_OPERATION_V8FLASH; pipe = B2C2_USB_CTRL_PIPE_OUT; break; default: deb_info("unsupported request for v8_mem_req %x.\n", req); return -EINVAL; } deb_v8("v8mem: %02x %02x %04x %04x, len: %d\n", request_type, req, wAddress, wIndex, buflen); mutex_lock(&fc_usb->data_mutex); if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT) memcpy(fc_usb->data, pbBuffer, buflen); ret = usb_control_msg(fc_usb->udev, pipe, req, request_type, wAddress, wIndex, fc_usb->data, buflen, nWaitTime); if (ret != buflen) ret = -EIO; if (ret >= 0) { ret = 0; if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN) memcpy(pbBuffer, fc_usb->data, buflen); } mutex_unlock(&fc_usb->data_mutex); debug_dump(pbBuffer, ret, deb_v8); return ret; } #define bytes_left_to_read_on_page(paddr, buflen) \ ((V8_MEMORY_PAGE_SIZE - (paddr & V8_MEMORY_PAGE_MASK)) > buflen \ ? buflen : (V8_MEMORY_PAGE_SIZE - (paddr & V8_MEMORY_PAGE_MASK))) static int flexcop_usb_memory_req(struct flexcop_usb *fc_usb, flexcop_usb_request_t req, flexcop_usb_mem_page_t page_start, u32 addr, int extended, u8 *buf, u32 len) { int ret = 0; u16 wMax; u32 pagechunk = 0; switch (req) { case B2C2_USB_READ_V8_MEM: wMax = USB_MEM_READ_MAX; break; case B2C2_USB_WRITE_V8_MEM: wMax = USB_MEM_WRITE_MAX; break; case B2C2_USB_FLASH_BLOCK: wMax = USB_FLASH_MAX; break; default: return -EINVAL; } while (len) { pagechunk = min(wMax, bytes_left_to_read_on_page(addr, len)); deb_info("%x\n", (addr & V8_MEMORY_PAGE_MASK) | (V8_MEMORY_EXTENDED*extended)); ret = flexcop_usb_v8_memory_req(fc_usb, req, page_start + (addr / V8_MEMORY_PAGE_SIZE), (addr & V8_MEMORY_PAGE_MASK) | (V8_MEMORY_EXTENDED*extended), buf, pagechunk); if (ret < 0) return ret; addr += pagechunk; buf += pagechunk; len -= pagechunk; } return 0; } static int flexcop_usb_get_mac_addr(struct flexcop_device *fc, int extended) { return flexcop_usb_memory_req(fc->bus_specific, B2C2_USB_READ_V8_MEM, V8_MEMORY_PAGE_FLASH, 0x1f010, 1, fc->dvb_adapter.proposed_mac, 6); } /* usb i2c stuff */ static int flexcop_usb_i2c_req(struct flexcop_i2c_adapter *i2c, flexcop_usb_request_t req, flexcop_usb_i2c_function_t func, u8 chipaddr, u8 addr, u8 *buf, u8 buflen) { struct flexcop_usb *fc_usb = i2c->fc->bus_specific; u16 wValue, wIndex; int nWaitTime, pipe, ret; u8 request_type = USB_TYPE_VENDOR; if (buflen > sizeof(fc_usb->data)) { err("Buffer size bigger than max URB control message\n"); return -EIO; } switch (func) { case USB_FUNC_I2C_WRITE: case USB_FUNC_I2C_MULTIWRITE: case USB_FUNC_I2C_REPEATWRITE: /* DKT 020208 - add this to support special case of DiSEqC */ case USB_FUNC_I2C_CHECKWRITE: pipe = B2C2_USB_CTRL_PIPE_OUT; nWaitTime = 2000; request_type |= USB_DIR_OUT; break; case USB_FUNC_I2C_READ: case USB_FUNC_I2C_REPEATREAD: pipe = B2C2_USB_CTRL_PIPE_IN; nWaitTime = 2000; request_type |= USB_DIR_IN; break; default: deb_info("unsupported function for i2c_req %x\n", func); return -EINVAL; } wValue = (func << 8) | (i2c->port << 4); wIndex = (chipaddr << 8 ) | addr; deb_i2c("i2c %2d: %02x %02x %02x %02x %02x %02x\n", func, request_type, req, wValue & 0xff, wValue >> 8, wIndex & 0xff, wIndex >> 8); mutex_lock(&fc_usb->data_mutex); if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT) memcpy(fc_usb->data, buf, buflen); ret = usb_control_msg(fc_usb->udev, pipe, req, request_type, wValue, wIndex, fc_usb->data, buflen, nWaitTime); if (ret != buflen) ret = -EIO; if (ret >= 0) { ret = 0; if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN) memcpy(buf, fc_usb->data, buflen); } mutex_unlock(&fc_usb->data_mutex); return ret; } /* actual bus specific access functions, make sure prototype are/will be equal to pci */ static flexcop_ibi_value flexcop_usb_read_ibi_reg(struct flexcop_device *fc, flexcop_ibi_register reg) { flexcop_ibi_value val; val.raw = 0; flexcop_usb_readwrite_dw(fc, reg, &val.raw, 1); return val; } static int flexcop_usb_write_ibi_reg(struct flexcop_device *fc, flexcop_ibi_register reg, flexcop_ibi_value val) { return flexcop_usb_readwrite_dw(fc, reg, &val.raw, 0); } static int flexcop_usb_i2c_request(struct flexcop_i2c_adapter *i2c, flexcop_access_op_t op, u8 chipaddr, u8 addr, u8 *buf, u16 len) { if (op == FC_READ) return flexcop_usb_i2c_req(i2c, B2C2_USB_I2C_REQUEST, USB_FUNC_I2C_READ, chipaddr, addr, buf, len); else return flexcop_usb_i2c_req(i2c, B2C2_USB_I2C_REQUEST, USB_FUNC_I2C_WRITE, chipaddr, addr, buf, len); } static void flexcop_usb_process_frame(struct flexcop_usb *fc_usb, u8 *buffer, int buffer_length) { u8 *b; int l; deb_ts("tmp_buffer_length=%d, buffer_length=%d\n", fc_usb->tmp_buffer_length, buffer_length); if (fc_usb->tmp_buffer_length > 0) { memcpy(fc_usb->tmp_buffer+fc_usb->tmp_buffer_length, buffer, buffer_length); fc_usb->tmp_buffer_length += buffer_length; b = fc_usb->tmp_buffer; l = fc_usb->tmp_buffer_length; } else { b = buffer; l = buffer_length; } while (l >= 190) { if (*b == 0xff) { switch (*(b+1) & 0x03) { case 0x01: /* media packet */ if (*(b+2) == 0x47) flexcop_pass_dmx_packets( fc_usb->fc_dev, b+2, 1); else deb_ts("not ts packet %*ph\n", 4, b+2); b += 190; l -= 190; break; default: deb_ts("wrong packet type\n"); l = 0; break; } } else { deb_ts("wrong header\n"); l = 0; } } if (l > 0) memcpy(fc_usb->tmp_buffer, b, l); fc_usb->tmp_buffer_length = l; } static void flexcop_usb_urb_complete(struct urb *urb) { struct flexcop_usb *fc_usb = urb->context; int i; if (urb->actual_length > 0) deb_ts("urb completed, bufsize: %d actlen; %d\n", urb->transfer_buffer_length, urb->actual_length); for (i = 0; i < urb->number_of_packets; i++) { if (urb->iso_frame_desc[i].status < 0) { err("iso frame descriptor %d has an error: %d\n", i, urb->iso_frame_desc[i].status); } else if (urb->iso_frame_desc[i].actual_length > 0) { deb_ts("passed %d bytes to the demux\n", urb->iso_frame_desc[i].actual_length); flexcop_usb_process_frame(fc_usb, urb->transfer_buffer + urb->iso_frame_desc[i].offset, urb->iso_frame_desc[i].actual_length); } urb->iso_frame_desc[i].status = 0; urb->iso_frame_desc[i].actual_length = 0; } usb_submit_urb(urb, GFP_ATOMIC); } static int flexcop_usb_stream_control(struct flexcop_device *fc, int onoff) { /* submit/kill iso packets */ return 0; } static void flexcop_usb_transfer_exit(struct flexcop_usb *fc_usb) { int i; for (i = 0; i < B2C2_USB_NUM_ISO_URB; i++) if (fc_usb->iso_urb[i] != NULL) { deb_ts("unlinking/killing urb no. %d\n", i); usb_kill_urb(fc_usb->iso_urb[i]); usb_free_urb(fc_usb->iso_urb[i]); } usb_free_coherent(fc_usb->udev, fc_usb->buffer_size, fc_usb->iso_buffer, fc_usb->dma_addr); } static int flexcop_usb_transfer_init(struct flexcop_usb *fc_usb) { struct usb_host_interface *alt = fc_usb->uintf->cur_altsetting; u16 frame_size; int bufsize, i, j, ret; int buffer_offset = 0; frame_size = usb_endpoint_maxp(&alt->endpoint[0].desc); bufsize = B2C2_USB_NUM_ISO_URB * B2C2_USB_FRAMES_PER_ISO * frame_size; deb_ts("creating %d iso-urbs with %d frames each of %d bytes size = %d.\n", B2C2_USB_NUM_ISO_URB, B2C2_USB_FRAMES_PER_ISO, frame_size, bufsize); fc_usb->iso_buffer = usb_alloc_coherent(fc_usb->udev, bufsize, GFP_KERNEL, &fc_usb->dma_addr); if (fc_usb->iso_buffer == NULL) return -ENOMEM; memset(fc_usb->iso_buffer, 0, bufsize); fc_usb->buffer_size = bufsize; /* creating iso urbs */ for (i = 0; i < B2C2_USB_NUM_ISO_URB; i++) { fc_usb->iso_urb[i] = usb_alloc_urb(B2C2_USB_FRAMES_PER_ISO, GFP_KERNEL); if (fc_usb->iso_urb[i] == NULL) { ret = -ENOMEM; goto urb_error; } } /* initialising and submitting iso urbs */ for (i = 0; i < B2C2_USB_NUM_ISO_URB; i++) { int frame_offset = 0; struct urb *urb = fc_usb->iso_urb[i]; deb_ts("initializing and submitting urb no. %d (buf_offset: %d).\n", i, buffer_offset); urb->dev = fc_usb->udev; urb->context = fc_usb; urb->complete = flexcop_usb_urb_complete; urb->pipe = B2C2_USB_DATA_PIPE; urb->transfer_flags = URB_ISO_ASAP; urb->interval = 1; urb->number_of_packets = B2C2_USB_FRAMES_PER_ISO; urb->transfer_buffer_length = frame_size * B2C2_USB_FRAMES_PER_ISO; urb->transfer_buffer = fc_usb->iso_buffer + buffer_offset; buffer_offset += frame_size * B2C2_USB_FRAMES_PER_ISO; for (j = 0; j < B2C2_USB_FRAMES_PER_ISO; j++) { deb_ts("urb no: %d, frame: %d, frame_offset: %d\n", i, j, frame_offset); urb->iso_frame_desc[j].offset = frame_offset; urb->iso_frame_desc[j].length = frame_size; frame_offset += frame_size; } if ((ret = usb_submit_urb(fc_usb->iso_urb[i],GFP_KERNEL))) { err("submitting urb %d failed with %d.", i, ret); goto urb_error; } deb_ts("submitted urb no. %d.\n", i); } /* SRAM */ flexcop_sram_set_dest(fc_usb->fc_dev, FC_SRAM_DEST_MEDIA | FC_SRAM_DEST_NET | FC_SRAM_DEST_CAO | FC_SRAM_DEST_CAI, FC_SRAM_DEST_TARGET_WAN_USB); flexcop_wan_set_speed(fc_usb->fc_dev, FC_WAN_SPEED_8MBITS); flexcop_sram_ctrl(fc_usb->fc_dev, 1, 1, 1); return 0; urb_error: flexcop_usb_transfer_exit(fc_usb); return ret; } static int flexcop_usb_init(struct flexcop_usb *fc_usb) { struct usb_host_interface *alt; int ret; /* use the alternate setting with the largest buffer */ ret = usb_set_interface(fc_usb->udev, 0, 1); if (ret) { err("set interface failed."); return ret; } alt = fc_usb->uintf->cur_altsetting; if (alt->desc.bNumEndpoints < 2) return -ENODEV; if (!usb_endpoint_is_isoc_in(&alt->endpoint[0].desc)) return -ENODEV; switch (fc_usb->udev->speed) { case USB_SPEED_LOW: err("cannot handle USB speed because it is too slow."); return -ENODEV; break; case USB_SPEED_FULL: info("running at FULL speed."); break; case USB_SPEED_HIGH: info("running at HIGH speed."); break; case USB_SPEED_SUPER: info("running at SUPER speed."); break; case USB_SPEED_SUPER_PLUS: info("running at SUPER+ speed."); break; case USB_SPEED_UNKNOWN: default: err("cannot handle USB speed because it is unknown."); return -ENODEV; } usb_set_intfdata(fc_usb->uintf, fc_usb); return 0; } static void flexcop_usb_exit(struct flexcop_usb *fc_usb) { usb_set_intfdata(fc_usb->uintf, NULL); } static int flexcop_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct flexcop_usb *fc_usb = NULL; struct flexcop_device *fc = NULL; int ret; if ((fc = flexcop_device_kmalloc(sizeof(struct flexcop_usb))) == NULL) { err("out of memory\n"); return -ENOMEM; } /* general flexcop init */ fc_usb = fc->bus_specific; fc_usb->fc_dev = fc; mutex_init(&fc_usb->data_mutex); fc->read_ibi_reg = flexcop_usb_read_ibi_reg; fc->write_ibi_reg = flexcop_usb_write_ibi_reg; fc->i2c_request = flexcop_usb_i2c_request; fc->get_mac_addr = flexcop_usb_get_mac_addr; fc->stream_control = flexcop_usb_stream_control; fc->pid_filtering = 1; fc->bus_type = FC_USB; fc->dev = &udev->dev; fc->owner = THIS_MODULE; /* bus specific part */ fc_usb->udev = udev; fc_usb->uintf = intf; if ((ret = flexcop_usb_init(fc_usb)) != 0) goto err_kfree; /* init flexcop */ if ((ret = flexcop_device_initialize(fc)) != 0) goto err_usb_exit; /* xfer init */ if ((ret = flexcop_usb_transfer_init(fc_usb)) != 0) goto err_fc_exit; info("%s successfully initialized and connected.", DRIVER_NAME); return 0; err_fc_exit: flexcop_device_exit(fc); err_usb_exit: flexcop_usb_exit(fc_usb); err_kfree: flexcop_device_kfree(fc); return ret; } static void flexcop_usb_disconnect(struct usb_interface *intf) { struct flexcop_usb *fc_usb = usb_get_intfdata(intf); flexcop_usb_transfer_exit(fc_usb); flexcop_device_exit(fc_usb->fc_dev); flexcop_usb_exit(fc_usb); flexcop_device_kfree(fc_usb->fc_dev); info("%s successfully deinitialized and disconnected.", DRIVER_NAME); } static const struct usb_device_id flexcop_usb_table[] = { { USB_DEVICE(0x0af7, 0x0101) }, { } }; MODULE_DEVICE_TABLE (usb, flexcop_usb_table); /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver flexcop_usb_driver = { .name = "b2c2_flexcop_usb", .probe = flexcop_usb_probe, .disconnect = flexcop_usb_disconnect, .id_table = flexcop_usb_table, }; module_usb_driver(flexcop_usb_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_NAME); MODULE_LICENSE("GPL");
86 85 85 45 117 85 56 117 117 117 116 117 117 1 4 118 117 117 55 55 55 5 55 55 55 45 45 45 85 85 3 86 4 86 36 86 86 53 117 117 24 35 35 35 35 35 35 24 55 98 21 19 3 21 88 88 6 100 100 99 100 41 4 4 40 19 23 21 24 24 24 21 24 3 2 20 21 19 24 24 24 24 24 24 24 24 24 24 24 24 24 86 86 85 40 86 40 40 40 39 40 40 40 85 85 85 85 1 85 85 1 1 1 1 1 86 1 86 86 86 86 86 41 41 85 85 84 85 85 85 85 85 85 85 36 86 85 86 56 56 3 3 3 55 52 16 30 55 55 55 37 37 20 37 24 24 24 30 30 30 117 117 116 116 113 33 53 116 116 117 117 117 33 116 115 116 122 86 56 117 117 122 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 // SPDX-License-Identifier: GPL-2.0-only /* * z3fold.c * * Author: Vitaly Wool <vitaly.wool@konsulko.com> * Copyright (C) 2016, Sony Mobile Communications Inc. * * This implementation is based on zbud written by Seth Jennings. * * z3fold is an special purpose allocator for storing compressed pages. It * can store up to three compressed pages per page which improves the * compression ratio of zbud while retaining its main concepts (e. g. always * storing an integral number of objects per page) and simplicity. * It still has simple and deterministic reclaim properties that make it * preferable to a higher density approach (with no requirement on integral * number of object per page) when reclaim is used. * * As in zbud, pages are divided into "chunks". The size of the chunks is * fixed at compile time and is determined by NCHUNKS_ORDER below. * * z3fold doesn't export any API and is meant to be used via zpool API. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/page-flags.h> #include <linux/migrate.h> #include <linux/node.h> #include <linux/compaction.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> #include <linux/kmemleak.h> /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively * adjusting internal fragmentation. It also determines the number of * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks * in the beginning of an allocated page are occupied by z3fold header, so * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), * which shows the max number of free chunks in z3fold page, also there will * be 63, or 62, respectively, freelists per pool. */ #define NCHUNKS_ORDER 6 #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) #define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) #define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS) #define BUDDY_MASK (0x3) #define BUDDY_SHIFT 2 #define SLOTS_ALIGN (0x40) /***************** * Structures *****************/ struct z3fold_pool; enum buddy { HEADLESS = 0, FIRST, MIDDLE, LAST, BUDDIES_MAX = LAST }; struct z3fold_buddy_slots { /* * we are using BUDDY_MASK in handle_to_buddy etc. so there should * be enough slots to hold all possible variants */ unsigned long slot[BUDDY_MASK + 1]; unsigned long pool; /* back link */ rwlock_t lock; }; #define HANDLE_FLAG_MASK (0x03) /* * struct z3fold_header - z3fold page metadata occupying first chunks of each * z3fold page, except for HEADLESS pages * @buddy: links the z3fold page into the relevant list in the * pool * @page_lock: per-page lock * @refcount: reference count for the z3fold page * @work: work_struct for page layout optimization * @slots: pointer to the structure holding buddy slots * @pool: pointer to the containing pool * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free * @first_num: the starting number (for the first handle) * @mapped_count: the number of objects currently mapped */ struct z3fold_header { struct list_head buddy; spinlock_t page_lock; struct kref refcount; struct work_struct work; struct z3fold_buddy_slots *slots; struct z3fold_pool *pool; short cpu; unsigned short first_chunks; unsigned short middle_chunks; unsigned short last_chunks; unsigned short start_middle; unsigned short first_num:2; unsigned short mapped_count:2; unsigned short foreign_handles:2; }; /** * struct z3fold_pool - stores metadata for each z3fold pool * @name: pool name * @lock: protects pool unbuddied lists * @stale_lock: protects pool stale page list * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- * buddies; the list each z3fold page is added to depends on * the size of its free region. * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. */ struct z3fold_pool { const char *name; spinlock_t lock; spinlock_t stale_lock; struct list_head *unbuddied; struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; }; /* * Internal z3fold page flags */ enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, PAGE_STALE, PAGE_CLAIMED, /* by either reclaim or free */ PAGE_MIGRATED, /* page is migrated and soon to be released */ }; /* * handle flags, go under HANDLE_FLAG_MASK */ enum z3fold_handle_flags { HANDLES_NOFREE = 0, }; /* * Forward declarations */ static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool); static void compact_page_work(struct work_struct *w); /***************** * Helpers *****************/ /* Converts an allocation size in bytes to size in z3fold chunks */ static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } #define for_each_unbuddied_list(_iter, _begin) \ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, gfp); if (slots) { /* It will be freed separately in free_handle(). */ kmemleak_not_leak(slots); slots->pool = (unsigned long)pool; rwlock_init(&slots->lock); } return slots; } static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) { return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); } static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) { return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); } /* Lock a z3fold page */ static inline void z3fold_page_lock(struct z3fold_header *zhdr) { spin_lock(&zhdr->page_lock); } /* Try to lock a z3fold page */ static inline int z3fold_page_trylock(struct z3fold_header *zhdr) { return spin_trylock(&zhdr->page_lock); } /* Unlock a z3fold page */ static inline void z3fold_page_unlock(struct z3fold_header *zhdr) { spin_unlock(&zhdr->page_lock); } /* return locked z3fold page if it's not headless */ static inline struct z3fold_header *get_z3fold_header(unsigned long handle) { struct z3fold_buddy_slots *slots; struct z3fold_header *zhdr; int locked = 0; if (!(handle & (1 << PAGE_HEADLESS))) { slots = handle_to_slots(handle); do { unsigned long addr; read_lock(&slots->lock); addr = *(unsigned long *)handle; zhdr = (struct z3fold_header *)(addr & PAGE_MASK); locked = z3fold_page_trylock(zhdr); read_unlock(&slots->lock); if (locked) { struct page *page = virt_to_page(zhdr); if (!test_bit(PAGE_MIGRATED, &page->private)) break; z3fold_page_unlock(zhdr); } cpu_relax(); } while (true); } else { zhdr = (struct z3fold_header *)(handle & PAGE_MASK); } return zhdr; } static inline void put_z3fold_header(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); if (!test_bit(PAGE_HEADLESS, &page->private)) z3fold_page_unlock(zhdr); } static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) { struct z3fold_buddy_slots *slots; int i; bool is_free; if (WARN_ON(*(unsigned long *)handle == 0)) return; slots = handle_to_slots(handle); write_lock(&slots->lock); *(unsigned long *)handle = 0; if (test_bit(HANDLES_NOFREE, &slots->pool)) { write_unlock(&slots->lock); return; /* simple case, nothing else to do */ } if (zhdr->slots != slots) zhdr->foreign_handles--; is_free = true; for (i = 0; i <= BUDDY_MASK; i++) { if (slots->slot[i]) { is_free = false; break; } } write_unlock(&slots->lock); if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); if (zhdr->slots == slots) zhdr->slots = NULL; kmem_cache_free(pool->c_handle, slots); } } /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) { struct z3fold_header *zhdr = page_address(page); struct z3fold_buddy_slots *slots; clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); clear_bit(PAGE_MIGRATED, &page->private); if (headless) return zhdr; slots = alloc_slots(pool, gfp); if (!slots) return NULL; memset(zhdr, 0, sizeof(*zhdr)); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); zhdr->cpu = -1; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } /* Resets the struct page fields and frees the page */ static void free_z3fold_page(struct page *page, bool headless) { if (!headless) { lock_page(page); __ClearPageMovable(page); unlock_page(page); } __free_page(page); } /* Helper function to build the index */ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) { return (bud + zhdr->first_num) & BUDDY_MASK; } /* * Encodes the handle of a particular buddy within a z3fold page. * Zhdr->page_lock should be held as this function accesses first_num * if bud != HEADLESS. */ static unsigned long __encode_handle(struct z3fold_header *zhdr, struct z3fold_buddy_slots *slots, enum buddy bud) { unsigned long h = (unsigned long)zhdr; int idx = 0; /* * For a headless page, its handle is its pointer with the extra * PAGE_HEADLESS bit set */ if (bud == HEADLESS) return h | (1 << PAGE_HEADLESS); /* otherwise, return pointer to encoded handle */ idx = __idx(zhdr, bud); h += idx; if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); write_lock(&slots->lock); slots->slot[idx] = h; write_unlock(&slots->lock); return (unsigned long)&slots->slot[idx]; } static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) { return __encode_handle(zhdr, zhdr->slots, bud); } /* only for LAST bud, returns zero otherwise */ static unsigned short handle_to_chunks(unsigned long handle) { struct z3fold_buddy_slots *slots = handle_to_slots(handle); unsigned long addr; read_lock(&slots->lock); addr = *(unsigned long *)handle; read_unlock(&slots->lock); return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; } /* * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle * but that doesn't matter. because the masking will result in the * correct buddy number. */ static enum buddy handle_to_buddy(unsigned long handle) { struct z3fold_header *zhdr; struct z3fold_buddy_slots *slots = handle_to_slots(handle); unsigned long addr; read_lock(&slots->lock); WARN_ON(handle & (1 << PAGE_HEADLESS)); addr = *(unsigned long *)handle; read_unlock(&slots->lock); zhdr = (struct z3fold_header *)(addr & PAGE_MASK); return (addr - zhdr->first_num) & BUDDY_MASK; } static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) { return zhdr->pool; } static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); struct z3fold_pool *pool = zhdr_to_pool(zhdr); WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); spin_unlock(&pool->lock); if (locked) z3fold_page_unlock(zhdr); spin_lock(&pool->stale_lock); list_add(&zhdr->buddy, &pool->stale); queue_work(pool->release_wq, &pool->work); spin_unlock(&pool->stale_lock); atomic64_dec(&pool->pages_nr); } static void release_z3fold_page_locked(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); WARN_ON(z3fold_page_trylock(zhdr)); __release_z3fold_page(zhdr, true); } static void release_z3fold_page_locked_list(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); struct z3fold_pool *pool = zhdr_to_pool(zhdr); spin_lock(&pool->lock); list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); WARN_ON(z3fold_page_trylock(zhdr)); __release_z3fold_page(zhdr, true); } static inline int put_z3fold_locked(struct z3fold_header *zhdr) { return kref_put(&zhdr->refcount, release_z3fold_page_locked); } static inline int put_z3fold_locked_list(struct z3fold_header *zhdr) { return kref_put(&zhdr->refcount, release_z3fold_page_locked_list); } static void free_pages_work(struct work_struct *w) { struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); spin_lock(&pool->stale_lock); while (!list_empty(&pool->stale)) { struct z3fold_header *zhdr = list_first_entry(&pool->stale, struct z3fold_header, buddy); struct page *page = virt_to_page(zhdr); list_del(&zhdr->buddy); if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) continue; spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); free_z3fold_page(page, false); cond_resched(); spin_lock(&pool->stale_lock); } spin_unlock(&pool->stale_lock); } /* * Returns the number of free chunks in a z3fold page. * NB: can't be used with HEADLESS pages. */ static int num_free_chunks(struct z3fold_header *zhdr) { int nfree; /* * If there is a middle object, pick up the bigger free space * either before or after it. Otherwise just subtract the number * of chunks occupied by the first and the last objects. */ if (zhdr->middle_chunks != 0) { int nfree_before = zhdr->first_chunks ? 0 : zhdr->start_middle - ZHDR_CHUNKS; int nfree_after = zhdr->last_chunks ? 0 : TOTAL_CHUNKS - (zhdr->start_middle + zhdr->middle_chunks); nfree = max(nfree_before, nfree_after); } else nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; return nfree; } /* Add to the appropriate unbuddied list */ static inline void add_to_unbuddied(struct z3fold_pool *pool, struct z3fold_header *zhdr) { if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { struct list_head *unbuddied; int freechunks = num_free_chunks(zhdr); migrate_disable(); unbuddied = this_cpu_ptr(pool->unbuddied); spin_lock(&pool->lock); list_add(&zhdr->buddy, &unbuddied[freechunks]); spin_unlock(&pool->lock); zhdr->cpu = smp_processor_id(); migrate_enable(); } } static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) { enum buddy bud = HEADLESS; if (zhdr->middle_chunks) { if (!zhdr->first_chunks && chunks <= zhdr->start_middle - ZHDR_CHUNKS) bud = FIRST; else if (!zhdr->last_chunks) bud = LAST; } else { if (!zhdr->first_chunks) bud = FIRST; else if (!zhdr->last_chunks) bud = LAST; else bud = MIDDLE; } return bud; } static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { void *beg = zhdr; return memmove(beg + (dst_chunk << CHUNK_SHIFT), beg + (zhdr->start_middle << CHUNK_SHIFT), zhdr->middle_chunks << CHUNK_SHIFT); } static inline bool buddy_single(struct z3fold_header *zhdr) { return !((zhdr->first_chunks && zhdr->middle_chunks) || (zhdr->first_chunks && zhdr->last_chunks) || (zhdr->middle_chunks && zhdr->last_chunks)); } static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) { struct z3fold_pool *pool = zhdr_to_pool(zhdr); void *p = zhdr; unsigned long old_handle = 0; size_t sz = 0; struct z3fold_header *new_zhdr = NULL; int first_idx = __idx(zhdr, FIRST); int middle_idx = __idx(zhdr, MIDDLE); int last_idx = __idx(zhdr, LAST); unsigned short *moved_chunks = NULL; /* * No need to protect slots here -- all the slots are "local" and * the page lock is already taken */ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { p += ZHDR_SIZE_ALIGNED; sz = zhdr->first_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; moved_chunks = &zhdr->first_chunks; } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { p += zhdr->start_middle << CHUNK_SHIFT; sz = zhdr->middle_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; moved_chunks = &zhdr->middle_chunks; } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); sz = zhdr->last_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; moved_chunks = &zhdr->last_chunks; } if (sz > 0) { enum buddy new_bud = HEADLESS; short chunks = size_to_chunks(sz); void *q; new_zhdr = __z3fold_alloc(pool, sz, false); if (!new_zhdr) return NULL; if (WARN_ON(new_zhdr == zhdr)) goto out_fail; new_bud = get_free_buddy(new_zhdr, chunks); q = new_zhdr; switch (new_bud) { case FIRST: new_zhdr->first_chunks = chunks; q += ZHDR_SIZE_ALIGNED; break; case MIDDLE: new_zhdr->middle_chunks = chunks; new_zhdr->start_middle = new_zhdr->first_chunks + ZHDR_CHUNKS; q += new_zhdr->start_middle << CHUNK_SHIFT; break; case LAST: new_zhdr->last_chunks = chunks; q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); break; default: goto out_fail; } new_zhdr->foreign_handles++; memcpy(q, p, sz); write_lock(&zhdr->slots->lock); *(unsigned long *)old_handle = (unsigned long)new_zhdr + __idx(new_zhdr, new_bud); if (new_bud == LAST) *(unsigned long *)old_handle |= (new_zhdr->last_chunks << BUDDY_SHIFT); write_unlock(&zhdr->slots->lock); add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); *moved_chunks = 0; } return new_zhdr; out_fail: if (new_zhdr && !put_z3fold_locked(new_zhdr)) { add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); } return NULL; } #define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) return 0; /* can't move middle chunk, it's used */ if (unlikely(PageIsolated(page))) return 0; if (zhdr->middle_chunks == 0) return 0; /* nothing to compact */ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { /* move to the beginning */ mchunk_memmove(zhdr, ZHDR_CHUNKS); zhdr->first_chunks = zhdr->middle_chunks; zhdr->middle_chunks = 0; zhdr->start_middle = 0; zhdr->first_num++; return 1; } /* * moving data is expensive, so let's only do that if * there's substantial gain (at least BIG_CHUNK_GAP chunks) */ if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= BIG_CHUNK_GAP) { mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; return 1; } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle + zhdr->middle_chunks) >= BIG_CHUNK_GAP) { unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - zhdr->middle_chunks; mchunk_memmove(zhdr, new_start); zhdr->start_middle = new_start; return 1; } return 0; } static void do_compact_page(struct z3fold_header *zhdr, bool locked) { struct z3fold_pool *pool = zhdr_to_pool(zhdr); struct page *page; page = virt_to_page(zhdr); if (locked) WARN_ON(z3fold_page_trylock(zhdr)); else z3fold_page_lock(zhdr); if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) { z3fold_page_unlock(zhdr); return; } spin_lock(&pool->lock); list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); if (put_z3fold_locked(zhdr)) return; if (test_bit(PAGE_STALE, &page->private) || test_and_set_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); return; } if (!zhdr->foreign_handles && buddy_single(zhdr) && zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { if (!put_z3fold_locked(zhdr)) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } return; } z3fold_compact_page(zhdr); add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } static void compact_page_work(struct work_struct *w) { struct z3fold_header *zhdr = container_of(w, struct z3fold_header, work); do_compact_page(zhdr, false); } /* returns _locked_ z3fold page header or NULL */ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, size_t size, bool can_sleep) { struct z3fold_header *zhdr = NULL; struct page *page; struct list_head *unbuddied; int chunks = size_to_chunks(size), i; lookup: migrate_disable(); /* First, try to find an unbuddied z3fold page. */ unbuddied = this_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { struct list_head *l = &unbuddied[i]; zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy); if (!zhdr) continue; /* Re-check under lock. */ spin_lock(&pool->lock); if (unlikely(zhdr != list_first_entry(READ_ONCE(l), struct z3fold_header, buddy)) || !z3fold_page_trylock(zhdr)) { spin_unlock(&pool->lock); zhdr = NULL; migrate_enable(); if (can_sleep) cond_resched(); goto lookup; } list_del_init(&zhdr->buddy); zhdr->cpu = -1; spin_unlock(&pool->lock); page = virt_to_page(zhdr); if (test_bit(NEEDS_COMPACTING, &page->private) || test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; migrate_enable(); if (can_sleep) cond_resched(); goto lookup; } /* * this page could not be removed from its unbuddied * list while pool lock was held, and then we've taken * page lock so kref_put could not be called before * we got here, so it's safe to just call kref_get() */ kref_get(&zhdr->refcount); break; } migrate_enable(); if (!zhdr) { int cpu; /* look for _exact_ match on other cpus' lists */ for_each_online_cpu(cpu) { struct list_head *l; unbuddied = per_cpu_ptr(pool->unbuddied, cpu); spin_lock(&pool->lock); l = &unbuddied[chunks]; zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy); if (!zhdr || !z3fold_page_trylock(zhdr)) { spin_unlock(&pool->lock); zhdr = NULL; continue; } list_del_init(&zhdr->buddy); zhdr->cpu = -1; spin_unlock(&pool->lock); page = virt_to_page(zhdr); if (test_bit(NEEDS_COMPACTING, &page->private) || test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; if (can_sleep) cond_resched(); continue; } kref_get(&zhdr->refcount); break; } } if (zhdr && !zhdr->slots) { zhdr->slots = alloc_slots(pool, GFP_ATOMIC); if (!zhdr->slots) goto out_fail; } return zhdr; out_fail: if (!put_z3fold_locked(zhdr)) { add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); } return NULL; } /* * API Functions */ /** * z3fold_create_pool() - create a new z3fold pool * @name: pool name * @gfp: gfp flags when allocating the z3fold pool structure * * Return: pointer to the new z3fold pool or NULL if the metadata allocation * failed. */ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp) { struct z3fold_pool *pool = NULL; int i, cpu; pool = kzalloc(sizeof(struct z3fold_pool), gfp); if (!pool) goto out; pool->c_handle = kmem_cache_create("z3fold_handle", sizeof(struct z3fold_buddy_slots), SLOTS_ALIGN, 0, NULL); if (!pool->c_handle) goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS, __alignof__(struct list_head)); if (!pool->unbuddied) goto out_pool; for_each_possible_cpu(cpu) { struct list_head *unbuddied = per_cpu_ptr(pool->unbuddied, cpu); for_each_unbuddied_list(i, 0) INIT_LIST_HEAD(&unbuddied[i]); } INIT_LIST_HEAD(&pool->stale); atomic64_set(&pool->pages_nr, 0); pool->name = name; pool->compact_wq = create_singlethread_workqueue(pool->name); if (!pool->compact_wq) goto out_unbuddied; pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; INIT_WORK(&pool->work, free_pages_work); return pool; out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: free_percpu(pool->unbuddied); out_pool: kmem_cache_destroy(pool->c_handle); out_c: kfree(pool); out: return NULL; } /** * z3fold_destroy_pool() - destroys an existing z3fold pool * @pool: the z3fold pool to be destroyed * * The pool should be emptied before this function is called. */ static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); /* * We need to destroy pool->compact_wq before pool->release_wq, * as any pending work on pool->compact_wq will call * queue_work(pool->release_wq, &pool->work). * * There are still outstanding pages until both workqueues are drained, * so we cannot unregister migration until then. */ destroy_workqueue(pool->compact_wq); destroy_workqueue(pool->release_wq); free_percpu(pool->unbuddied); kfree(pool); } static const struct movable_operations z3fold_mops; /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate * @size: size in bytes of the desired allocation * @gfp: gfp flags used if the pool needs to grow * @handle: handle of the new allocation * * This function will attempt to find a free region in the pool large enough to * satisfy the allocation request. A search of the unbuddied lists is * performed first. If no suitable free region is found, then a new page is * allocated and added to the pool to satisfy the request. * * Return: 0 if success and handle is set, otherwise -EINVAL if the size or * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks = size_to_chunks(size); struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; bool can_sleep = gfpflags_allow_blocking(gfp); if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE) return -ENOSPC; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { retry: zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { bud = get_free_buddy(zhdr, chunks); if (bud == HEADLESS) { if (!put_z3fold_locked(zhdr)) z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); goto retry; } page = virt_to_page(zhdr); goto found; } bud = FIRST; } page = alloc_page(gfp); if (!page) return -ENOMEM; zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); if (!zhdr) { __free_page(page); return -ENOMEM; } atomic64_inc(&pool->pages_nr); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); goto headless; } if (can_sleep) { lock_page(page); __SetPageMovable(page, &z3fold_mops); unlock_page(page); } else { WARN_ON(!trylock_page(page)); __SetPageMovable(page, &z3fold_mops); unlock_page(page); } z3fold_page_lock(zhdr); found: if (bud == FIRST) zhdr->first_chunks = chunks; else if (bud == LAST) zhdr->last_chunks = chunks; else { zhdr->middle_chunks = chunks; zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } add_to_unbuddied(pool, zhdr); headless: spin_lock(&pool->lock); *handle = encode_handle(zhdr, bud); spin_unlock(&pool->lock); if (bud != HEADLESS) z3fold_page_unlock(zhdr); return 0; } /** * z3fold_free() - frees the allocation associated with the given handle * @pool: pool in which the allocation resided * @handle: handle associated with the allocation returned by z3fold_alloc() * * In the case that the z3fold page in which the allocation resides is under * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function * only sets the first|middle|last_chunks to 0. The page is actually freed * once all buddies are evicted (see z3fold_reclaim_page() below). */ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; struct page *page; enum buddy bud; bool page_claimed; zhdr = get_z3fold_header(handle); page = virt_to_page(zhdr); page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private); if (test_bit(PAGE_HEADLESS, &page->private)) { /* if a headless page is under reclaim, just leave. * NB: we use test_and_set_bit for a reason: if the bit * has not been set before, we release this page * immediately so we don't care about its value any more. */ if (!page_claimed) { put_z3fold_header(zhdr); free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); } return; } /* Non-headless case */ bud = handle_to_buddy(handle); switch (bud) { case FIRST: zhdr->first_chunks = 0; break; case MIDDLE: zhdr->middle_chunks = 0; break; case LAST: zhdr->last_chunks = 0; break; default: pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); put_z3fold_header(zhdr); return; } if (!page_claimed) free_handle(handle, zhdr); if (put_z3fold_locked_list(zhdr)) return; if (page_claimed) { /* the page has not been claimed by us */ put_z3fold_header(zhdr); return; } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { clear_bit(PAGE_CLAIMED, &page->private); put_z3fold_header(zhdr); return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { zhdr->cpu = -1; kref_get(&zhdr->refcount); clear_bit(PAGE_CLAIMED, &page->private); do_compact_page(zhdr, true); return; } kref_get(&zhdr->refcount); clear_bit(PAGE_CLAIMED, &page->private); queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); put_z3fold_header(zhdr); } /** * z3fold_map() - maps the allocation associated with the given handle * @pool: pool in which the allocation resides * @handle: handle associated with the allocation to be mapped * * Extracts the buddy number from handle and constructs the pointer to the * correct starting chunk within the page. * * Returns: a pointer to the mapped allocation */ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; struct page *page; void *addr; enum buddy buddy; zhdr = get_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) goto out; buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: addr += ZHDR_SIZE_ALIGNED; break; case MIDDLE: addr += zhdr->start_middle << CHUNK_SHIFT; set_bit(MIDDLE_CHUNK_MAPPED, &page->private); break; case LAST: addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); break; default: pr_err("unknown buddy id %d\n", buddy); WARN_ON(1); addr = NULL; break; } if (addr) zhdr->mapped_count++; out: put_z3fold_header(zhdr); return addr; } /** * z3fold_unmap() - unmaps the allocation associated with the given handle * @pool: pool in which the allocation resides * @handle: handle associated with the allocation to be unmapped */ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; struct page *page; enum buddy buddy; zhdr = get_z3fold_header(handle); page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) return; buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); zhdr->mapped_count--; put_z3fold_header(zhdr); } /** * z3fold_get_pool_pages() - gets the z3fold pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. */ static u64 z3fold_get_pool_pages(struct z3fold_pool *pool) { return atomic64_read(&pool->pages_nr); } static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) { struct z3fold_header *zhdr; struct z3fold_pool *pool; VM_BUG_ON_PAGE(PageIsolated(page), page); if (test_bit(PAGE_HEADLESS, &page->private)) return false; zhdr = page_address(page); z3fold_page_lock(zhdr); if (test_bit(NEEDS_COMPACTING, &page->private) || test_bit(PAGE_STALE, &page->private)) goto out; if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) goto out; if (test_and_set_bit(PAGE_CLAIMED, &page->private)) goto out; pool = zhdr_to_pool(zhdr); spin_lock(&pool->lock); if (!list_empty(&zhdr->buddy)) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); kref_get(&zhdr->refcount); z3fold_page_unlock(zhdr); return true; out: z3fold_page_unlock(zhdr); return false; } static int z3fold_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; VM_BUG_ON_PAGE(!PageIsolated(page), page); VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); zhdr = page_address(page); pool = zhdr_to_pool(zhdr); if (!z3fold_page_trylock(zhdr)) return -EAGAIN; if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); return -EBUSY; } if (work_pending(&zhdr->work)) { z3fold_page_unlock(zhdr); return -EAGAIN; } new_zhdr = page_address(newpage); memcpy(new_zhdr, zhdr, PAGE_SIZE); newpage->private = page->private; set_bit(PAGE_MIGRATED, &page->private); z3fold_page_unlock(zhdr); spin_lock_init(&new_zhdr->page_lock); INIT_WORK(&new_zhdr->work, compact_page_work); /* * z3fold_page_isolate() ensures that new_zhdr->buddy is empty, * so we only have to reinitialize it. */ INIT_LIST_HEAD(&new_zhdr->buddy); __ClearPageMovable(page); get_page(newpage); z3fold_page_lock(new_zhdr); if (new_zhdr->first_chunks) encode_handle(new_zhdr, FIRST); if (new_zhdr->last_chunks) encode_handle(new_zhdr, LAST); if (new_zhdr->middle_chunks) encode_handle(new_zhdr, MIDDLE); set_bit(NEEDS_COMPACTING, &newpage->private); new_zhdr->cpu = smp_processor_id(); __SetPageMovable(newpage, &z3fold_mops); z3fold_page_unlock(new_zhdr); queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */ page->private = 0; put_page(page); return 0; } static void z3fold_page_putback(struct page *page) { struct z3fold_header *zhdr; struct z3fold_pool *pool; zhdr = page_address(page); pool = zhdr_to_pool(zhdr); z3fold_page_lock(zhdr); if (!list_empty(&zhdr->buddy)) list_del_init(&zhdr->buddy); INIT_LIST_HEAD(&page->lru); if (put_z3fold_locked(zhdr)) return; if (list_empty(&zhdr->buddy)) add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, }; /***************** * zpool ****************/ static void *z3fold_zpool_create(const char *name, gfp_t gfp) { return z3fold_create_pool(name, gfp); } static void z3fold_zpool_destroy(void *pool) { z3fold_destroy_pool(pool); } static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { return z3fold_alloc(pool, size, gfp, handle); } static void z3fold_zpool_free(void *pool, unsigned long handle) { z3fold_free(pool, handle); } static void *z3fold_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { return z3fold_map(pool, handle); } static void z3fold_zpool_unmap(void *pool, unsigned long handle) { z3fold_unmap(pool, handle); } static u64 z3fold_zpool_total_pages(void *pool) { return z3fold_get_pool_pages(pool); } static struct zpool_driver z3fold_zpool_driver = { .type = "z3fold", .sleep_mapped = true, .owner = THIS_MODULE, .create = z3fold_zpool_create, .destroy = z3fold_zpool_destroy, .malloc = z3fold_zpool_malloc, .free = z3fold_zpool_free, .map = z3fold_zpool_map, .unmap = z3fold_zpool_unmap, .total_pages = z3fold_zpool_total_pages, }; MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { /* * Make sure the z3fold header is not larger than the page size and * there has remaining spaces for its buddy. */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); zpool_register_driver(&z3fold_zpool_driver); return 0; } static void __exit exit_z3fold(void) { zpool_unregister_driver(&z3fold_zpool_driver); } module_init(init_z3fold); module_exit(exit_z3fold); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>"); MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
1 128 1 67 59 67 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0 */ /* * INETPEER - A storage for permanent information about peers * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #ifndef _NET_INETPEER_H #define _NET_INETPEER_H #include <linux/types.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/rtnetlink.h> #include <net/ipv6.h> #include <linux/atomic.h> /* IPv4 address key for cache lookups */ struct ipv4_addr_key { __be32 addr; int vif; }; #define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) struct inetpeer_addr { union { struct ipv4_addr_key a4; struct in6_addr a6; u32 key[INETPEER_MAXKEYSZ]; }; __u16 family; }; struct inet_peer { struct rb_node rb_node; struct inetpeer_addr daddr; u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ u32 n_redirects; unsigned long rate_last; /* * Once inet_peer is queued for deletion (refcnt == 0), following field * is not available: rid * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ }; struct rcu_head rcu; }; /* following fields might be frequently dirtied */ __u32 dtime; /* the time of last use of not referenced entries */ refcount_t refcnt; }; struct inet_peer_base { struct rb_root rb_root; seqlock_t lock; int total; }; void inet_peer_base_init(struct inet_peer_base *); void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { iaddr->a4.addr = ip; iaddr->a4.vif = 0; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { return iaddr->a4.addr; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, struct in6_addr *in6) { iaddr->a6 = *in6; iaddr->family = AF_INET6; } static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) { return &iaddr->a6; } /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, int vif, int create) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, const struct in6_addr *v6daddr, int create) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; return inet_getpeer(base, &daddr, create); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { int i, n; if (a->family == AF_INET) n = sizeof(a->a4) / sizeof(u32); else n = sizeof(a->a6) / sizeof(u32); for (i = 0; i < n; i++) { if (a->key[i] == b->key[i]) continue; if (a->key[i] < b->key[i]) return -1; return 1; } return 0; } /* can be called from BH context or outside */ void inet_putpeer(struct inet_peer *p); bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); #endif /* _NET_INETPEER_H */
1 83 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM erofs #if !defined(_TRACE_EROFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EROFS_H #include <linux/tracepoint.h> #include <linux/fs.h> struct erofs_map_blocks; #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_nid(entry) show_dev(entry->dev), entry->nid #define show_file_type(type) \ __print_symbolic(type, \ { 0, "FILE" }, \ { 1, "DIR" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EROFS_GET_BLOCKS_FIEMAP, "FIEMAP" }, \ { EROFS_GET_BLOCKS_READMORE, "READMORE" }, \ { EROFS_GET_BLOCKS_FINDTAIL, "FINDTAIL" }) #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ { EROFS_MAP_ENCODED, "E" }, \ { EROFS_MAP_FULL_MAPPED, "F" }, \ { EROFS_MAP_FRAGMENT, "R" }, \ { EROFS_MAP_PARTIAL_REF, "P" }) TRACE_EVENT(erofs_lookup, TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __string(name, dentry->d_name.name ) __field(unsigned int, flags ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; __assign_str(name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", show_dev_nid(__entry), __get_str(name), __entry->flags) ); TRACE_EVENT(erofs_fill_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(erofs_blk_t, blkaddr ) __field(unsigned int, ofs ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->blkaddr = erofs_blknr(inode->i_sb, erofs_iloc(inode)); __entry->ofs = erofs_blkoff(inode->i_sb, erofs_iloc(inode)); ), TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u", show_dev_nid(__entry), __entry->blkaddr, __entry->ofs) ); TRACE_EVENT(erofs_read_folio, TP_PROTO(struct folio *folio, bool raw), TP_ARGS(folio, raw), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(int, dir ) __field(pgoff_t, index ) __field(int, uptodate) __field(bool, raw ) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->nid = EROFS_I(folio->mapping->host)->nid; __entry->dir = S_ISDIR(folio->mapping->host->i_mode); __entry->index = folio->index; __entry->uptodate = folio_test_uptodate(folio); __entry->raw = raw; ), TP_printk("dev = (%d,%d), nid = %llu, %s, index = %lu, uptodate = %d " "raw = %d", show_dev_nid(__entry), show_file_type(__entry->dir), (unsigned long)__entry->index, __entry->uptodate, __entry->raw) ); TRACE_EVENT(erofs_readpages, TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage, bool raw), TP_ARGS(inode, start, nrpage, raw), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(pgoff_t, start ) __field(unsigned int, nrpage ) __field(bool, raw ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->start = start; __entry->nrpage = nrpage; __entry->raw = raw; ), TP_printk("dev = (%d,%d), nid = %llu, start = %lu nrpage = %u raw = %d", show_dev_nid(__entry), (unsigned long)__entry->start, __entry->nrpage, __entry->raw) ); TRACE_EVENT(erofs_map_blocks_enter, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags), TP_ARGS(inode, map, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) __field( erofs_off_t, la ) __field( u64, llen ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->la = map->m_la; __entry->llen = map->m_llen; __entry->flags = flags; ), TP_printk("dev = (%d,%d), nid = %llu, la %llu llen %llu flags %s", show_dev_nid(__entry), __entry->la, __entry->llen, __entry->flags ? show_map_flags(__entry->flags) : "NULL") ); TRACE_EVENT(erofs_map_blocks_exit, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags, int ret), TP_ARGS(inode, map, flags, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) __field( unsigned int, flags ) __field( erofs_off_t, la ) __field( erofs_off_t, pa ) __field( u64, llen ) __field( u64, plen ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->flags = flags; __entry->la = map->m_la; __entry->pa = map->m_pa; __entry->llen = map->m_llen; __entry->plen = map->m_plen; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), nid = %llu, flags %s " "la %llu pa %llu llen %llu plen %llu mflags %s ret %d", show_dev_nid(__entry), __entry->flags ? show_map_flags(__entry->flags) : "NULL", __entry->la, __entry->pa, __entry->llen, __entry->plen, show_mflags(__entry->mflags), __entry->ret) ); TRACE_EVENT(erofs_destroy_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; ), TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) ); #endif /* _TRACE_EROFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
15 15 15 9 9 9 5 8 4 8 2 8 15 13 2 12 1 11 13 2 10 8 8 4 4 4 4 4 9 17 17 17 17 15 1 1 1 17 13 3 13 13 14 1 14 14 13 12 11 11 7 11 11 4 3 2 1 2 2 8 1 7 7 7 7 3 4 14 11 10 9 8 2 7 6 5 5 5 7 11 2 2 5 5 5 3 2 1 2 1 1 2 4 1 5 3 3 17 15 15 33 33 33 33 31 30 29 28 3 2 1 28 4 1 28 26 9 6 5 4 29 26 29 12 5 7 2 7 21 2 1 5 24 19 8 24 4 23 22 3 19 24 23 24 5 24 23 24 23 24 21 1 22 17 22 2 22 24 33 7 4 4 8 8 7 1 6 5 4 5 4 4 3 1 3 4 1 4 2 5 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <linux/in6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include "l2tp_core.h" /* per-net private data for this module */ static unsigned int l2tp_ip6_net_id; struct l2tp_ip6_net { rwlock_t l2tp_ip6_lock; struct hlist_head l2tp_ip6_table; struct hlist_head l2tp_ip6_bind_table; }; struct l2tp_ip6_sock { /* inet_sock has to be the first member of l2tp_ip6_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; struct ipv6_pinfo inet6; }; static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) { return (struct l2tp_ip6_sock *)sk; } static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net) { return net_generic(net, l2tp_ip6_net_id); } static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, const struct in6_addr *laddr, const struct in6_addr *raddr, int dif, u32 tunnel_id) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); struct sock *sk; sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (sk_laddr && !ipv6_addr_any(sk_laddr) && !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr)) continue; if (!ipv6_addr_any(sk_raddr) && raddr && !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr)) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip6_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct l2tp_ip6_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct ipv6hdr *iph; pn = l2tp_ip6_pernet(net); if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_put(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); read_lock_bh(&pn->l2tp_ip6_lock); sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, inet6_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&pn->l2tp_ip6_lock); goto discard; } sock_hold(sk); read_unlock_bh(&pn->l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_put(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip6_hash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) { write_lock_bh(&pn->l2tp_ip6_lock); sk_add_node(sk, &pn->l2tp_ip6_table); write_unlock_bh(&pn->l2tp_ip6_lock); } return 0; } static void l2tp_ip6_unhash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) return; write_lock_bh(&pn->l2tp_ip6_lock); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); } static int l2tp_ip6_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip6_hash(sk); return 0; } static void l2tp_ip6_close(struct sock *sk, long timeout) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sk_common_release(sk); } static void l2tp_ip6_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel; lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); struct l2tp_ip6_net *pn; __be32 v4addr = 0; int bound_dev_if; int addr_type; int err; pn = l2tp_ip6_pernet(net); if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) return -EINVAL; addr_type = ipv6_addr_type(&addr->l2tp_addr); /* l2tp_ip6 sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; /* L2TP is point-point, not multicast */ if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out_unlock; if (sk->sk_state != TCP_CLOSE) goto out_unlock; bound_dev_if = sk->sk_bound_dev_if; /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { if (addr->l2tp_scope_id) bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an * interface. */ if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->l2tp_addr, dev, 0)) goto out_unlock_rcu; } rcu_read_unlock(); write_lock_bh(&pn->l2tp_ip6_lock); if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&pn->l2tp_ip6_lock); err = -EADDRINUSE; goto out_unlock; } inet->inet_saddr = v4addr; inet->inet_rcv_saddr = v4addr; sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sock_reset_flag(sk, SOCK_ZAPPED); release_sock(sk); return 0; out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); return err; } static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; struct l2tp_ip6_net *pn; if (addr_len < sizeof(*lsa)) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -EINVAL; if (addr_type & IPV6_ADDR_MAPPED) { daddr = &usin->sin6_addr; if (ipv4_is_multicast(daddr->s6_addr32[3])) return -EINVAL; } lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip6_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); write_unlock_bh(&pn->l2tp_ip6_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip6_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sock *sk = sock->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct l2tp_ip6_sock *lsk = l2tp_ip6_sk(sk); lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr = sk->sk_v6_daddr; if (inet6_test_bit(SNDFLOW, sk)) lsa->l2tp_flowinfo = np->flow_label; } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) lsa->l2tp_addr = np->saddr; else lsa->l2tp_addr = sk->sk_v6_rcv_saddr; lsa->l2tp_conn_id = lsk->conn_id; } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if); return sizeof(*lsa); } static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } static int l2tp_ip6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; __be32 *transhdr = NULL; int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; transhdr = (__be32 *)skb_transport_header(skb); *transhdr = 0; err = ip6_push_pending_frames(sk); out: return err; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ int ulen; int err; /* Rough check on arithmetic overflow, * better check is made in ip6_append_data(). */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (lsa->l2tp_family && lsa->l2tp_family != AF_INET6) return -EAFNOSUPPORT; daddr = &lsa->l2tp_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* Otherwise it will be difficult to maintain * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && lsa->l2tp_scope_id && ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL) fl6.flowi6_oif = lsa->l2tp_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen | opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (ipc6.dontfrag < 0) ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: lock_sock(sk); ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = l2tp_ip6_push_pending_frames(sk); release_sock(sk); done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (lsa) { lsa->l2tp_family = AF_INET6; lsa->l2tp_unused = 0; lsa->l2tp_addr = ipv6_hdr(skb)->saddr; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = inet6_iif(skb); *addr_len = sizeof(*lsa); } if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } static struct proto l2tp_ip6_prot = { .name = "L2TP/IPv6", .owner = THIS_MODULE, .init = l2tp_ip6_open, .close = l2tp_ip6_close, .bind = l2tp_ip6_bind, .connect = l2tp_ip6_connect, .disconnect = l2tp_ip6_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip6_destroy_sock, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, .poll = datagram_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw l2tp_ip6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip6_prot, .ops = &l2tp_ip6_ops, }; static struct inet6_protocol l2tp_ip6_protocol __read_mostly = { .handler = l2tp_ip6_recv, }; static __net_init int l2tp_ip6_init_net(struct net *net) { struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id); rwlock_init(&pn->l2tp_ip6_lock); INIT_HLIST_HEAD(&pn->l2tp_ip6_table); INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table); return 0; } static __net_exit void l2tp_ip6_exit_net(struct net *net) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); write_lock_bh(&pn->l2tp_ip6_lock); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0); write_unlock_bh(&pn->l2tp_ip6_lock); } static struct pernet_operations l2tp_ip6_net_ops = { .init = l2tp_ip6_init_net, .exit = l2tp_ip6_exit_net, .id = &l2tp_ip6_net_id, .size = sizeof(struct l2tp_ip6_net), }; static int __init l2tp_ip6_init(void) { int err; pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n"); err = register_pernet_device(&l2tp_ip6_net_ops); if (err) goto out; err = proto_register(&l2tp_ip6_prot, 1); if (err != 0) goto out1; err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); if (err) goto out2; inet6_register_protosw(&l2tp_ip6_protosw); return 0; out2: proto_unregister(&l2tp_ip6_prot); out1: unregister_pernet_device(&l2tp_ip6_net_ops); out: return err; } static void __exit l2tp_ip6_exit(void) { inet6_unregister_protosw(&l2tp_ip6_protosw); inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip6_prot); unregister_pernet_device(&l2tp_ip6_net_ops); } module_init(l2tp_ip6_init); module_exit(l2tp_ip6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Chris Elston <celston@katalix.com>"); MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115);
3 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 /* RFCOMM implementation for Linux Bluetooth stack (BlueZ) Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/refcount.h> #ifndef __RFCOMM_H #define __RFCOMM_H #define RFCOMM_CONN_TIMEOUT (HZ * 30) #define RFCOMM_DISC_TIMEOUT (HZ * 20) #define RFCOMM_AUTH_TIMEOUT (HZ * 25) #define RFCOMM_IDLE_TIMEOUT (HZ * 2) #define RFCOMM_DEFAULT_MTU 127 #define RFCOMM_DEFAULT_CREDITS 7 #define RFCOMM_MAX_CREDITS 40 #define RFCOMM_SKB_HEAD_RESERVE 8 #define RFCOMM_SKB_TAIL_RESERVE 2 #define RFCOMM_SKB_RESERVE (RFCOMM_SKB_HEAD_RESERVE + RFCOMM_SKB_TAIL_RESERVE) #define RFCOMM_SABM 0x2f #define RFCOMM_DISC 0x43 #define RFCOMM_UA 0x63 #define RFCOMM_DM 0x0f #define RFCOMM_UIH 0xef #define RFCOMM_TEST 0x08 #define RFCOMM_FCON 0x28 #define RFCOMM_FCOFF 0x18 #define RFCOMM_MSC 0x38 #define RFCOMM_RPN 0x24 #define RFCOMM_RLS 0x14 #define RFCOMM_PN 0x20 #define RFCOMM_NSC 0x04 #define RFCOMM_V24_FC 0x02 #define RFCOMM_V24_RTC 0x04 #define RFCOMM_V24_RTR 0x08 #define RFCOMM_V24_IC 0x40 #define RFCOMM_V24_DV 0x80 #define RFCOMM_RPN_BR_2400 0x0 #define RFCOMM_RPN_BR_4800 0x1 #define RFCOMM_RPN_BR_7200 0x2 #define RFCOMM_RPN_BR_9600 0x3 #define RFCOMM_RPN_BR_19200 0x4 #define RFCOMM_RPN_BR_38400 0x5 #define RFCOMM_RPN_BR_57600 0x6 #define RFCOMM_RPN_BR_115200 0x7 #define RFCOMM_RPN_BR_230400 0x8 #define RFCOMM_RPN_DATA_5 0x0 #define RFCOMM_RPN_DATA_6 0x1 #define RFCOMM_RPN_DATA_7 0x2 #define RFCOMM_RPN_DATA_8 0x3 #define RFCOMM_RPN_STOP_1 0 #define RFCOMM_RPN_STOP_15 1 #define RFCOMM_RPN_PARITY_NONE 0x0 #define RFCOMM_RPN_PARITY_ODD 0x1 #define RFCOMM_RPN_PARITY_EVEN 0x3 #define RFCOMM_RPN_PARITY_MARK 0x5 #define RFCOMM_RPN_PARITY_SPACE 0x7 #define RFCOMM_RPN_FLOW_NONE 0x00 #define RFCOMM_RPN_XON_CHAR 0x11 #define RFCOMM_RPN_XOFF_CHAR 0x13 #define RFCOMM_RPN_PM_BITRATE 0x0001 #define RFCOMM_RPN_PM_DATA 0x0002 #define RFCOMM_RPN_PM_STOP 0x0004 #define RFCOMM_RPN_PM_PARITY 0x0008 #define RFCOMM_RPN_PM_PARITY_TYPE 0x0010 #define RFCOMM_RPN_PM_XON 0x0020 #define RFCOMM_RPN_PM_XOFF 0x0040 #define RFCOMM_RPN_PM_FLOW 0x3F00 #define RFCOMM_RPN_PM_ALL 0x3F7F struct rfcomm_hdr { u8 addr; u8 ctrl; u8 len; /* Actual size can be 2 bytes */ } __packed; struct rfcomm_cmd { u8 addr; u8 ctrl; u8 len; u8 fcs; } __packed; struct rfcomm_mcc { u8 type; u8 len; } __packed; struct rfcomm_pn { u8 dlci; u8 flow_ctrl; u8 priority; u8 ack_timer; __le16 mtu; u8 max_retrans; u8 credits; } __packed; struct rfcomm_rpn { u8 dlci; u8 bit_rate; u8 line_settings; u8 flow_ctrl; u8 xon_char; u8 xoff_char; __le16 param_mask; } __packed; struct rfcomm_rls { u8 dlci; u8 status; } __packed; struct rfcomm_msc { u8 dlci; u8 v24_sig; } __packed; /* ---- Core structures, flags etc ---- */ struct rfcomm_session { struct list_head list; struct socket *sock; struct timer_list timer; unsigned long state; unsigned long flags; int initiator; /* Default DLC parameters */ int cfc; uint mtu; struct list_head dlcs; }; struct rfcomm_dlc { struct list_head list; struct rfcomm_session *session; struct sk_buff_head tx_queue; struct timer_list timer; struct mutex lock; unsigned long state; unsigned long flags; refcount_t refcnt; u8 dlci; u8 addr; u8 priority; u8 v24_sig; u8 remote_v24_sig; u8 mscex; u8 out; u8 sec_level; u8 role_switch; u32 defer_setup; uint mtu; uint cfc; uint rx_credits; uint tx_credits; void *owner; void (*data_ready)(struct rfcomm_dlc *d, struct sk_buff *skb); void (*state_change)(struct rfcomm_dlc *d, int err); void (*modem_status)(struct rfcomm_dlc *d, u8 v24_sig); }; /* DLC and session flags */ #define RFCOMM_RX_THROTTLED 0 #define RFCOMM_TX_THROTTLED 1 #define RFCOMM_TIMED_OUT 2 #define RFCOMM_MSC_PENDING 3 #define RFCOMM_SEC_PENDING 4 #define RFCOMM_AUTH_PENDING 5 #define RFCOMM_AUTH_ACCEPT 6 #define RFCOMM_AUTH_REJECT 7 #define RFCOMM_DEFER_SETUP 8 #define RFCOMM_ENC_DROP 9 /* Scheduling flags and events */ #define RFCOMM_SCHED_WAKEUP 31 /* MSC exchange flags */ #define RFCOMM_MSCEX_TX 1 #define RFCOMM_MSCEX_RX 2 #define RFCOMM_MSCEX_OK (RFCOMM_MSCEX_TX + RFCOMM_MSCEX_RX) /* CFC states */ #define RFCOMM_CFC_UNKNOWN -1 #define RFCOMM_CFC_DISABLED 0 #define RFCOMM_CFC_ENABLED RFCOMM_MAX_CREDITS /* ---- RFCOMM SEND RPN ---- */ int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, u8 bit_rate, u8 data_bits, u8 stop_bits, u8 parity, u8 flow_ctrl_settings, u8 xon_char, u8 xoff_char, u16 param_mask); /* ---- RFCOMM DLCs (channels) ---- */ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio); void rfcomm_dlc_free(struct rfcomm_dlc *d); int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel); int rfcomm_dlc_close(struct rfcomm_dlc *d, int reason); int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb); void rfcomm_dlc_send_noerror(struct rfcomm_dlc *d, struct sk_buff *skb); int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig); int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig); void rfcomm_dlc_accept(struct rfcomm_dlc *d); struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel); #define rfcomm_dlc_lock(d) mutex_lock(&d->lock) #define rfcomm_dlc_unlock(d) mutex_unlock(&d->lock) static inline void rfcomm_dlc_hold(struct rfcomm_dlc *d) { refcount_inc(&d->refcnt); } static inline void rfcomm_dlc_put(struct rfcomm_dlc *d) { if (refcount_dec_and_test(&d->refcnt)) rfcomm_dlc_free(d); } void __rfcomm_dlc_throttle(struct rfcomm_dlc *d); void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d); static inline void rfcomm_dlc_throttle(struct rfcomm_dlc *d) { if (!test_and_set_bit(RFCOMM_RX_THROTTLED, &d->flags)) __rfcomm_dlc_throttle(d); } static inline void rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) { if (test_and_clear_bit(RFCOMM_RX_THROTTLED, &d->flags)) __rfcomm_dlc_unthrottle(d); } /* ---- RFCOMM sessions ---- */ void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst); /* ---- RFCOMM sockets ---- */ struct sockaddr_rc { sa_family_t rc_family; bdaddr_t rc_bdaddr; u8 rc_channel; }; #define RFCOMM_CONNINFO 0x02 struct rfcomm_conninfo { __u16 hci_handle; __u8 dev_class[3]; }; #define RFCOMM_LM 0x03 #define RFCOMM_LM_MASTER 0x0001 #define RFCOMM_LM_AUTH 0x0002 #define RFCOMM_LM_ENCRYPT 0x0004 #define RFCOMM_LM_TRUSTED 0x0008 #define RFCOMM_LM_RELIABLE 0x0010 #define RFCOMM_LM_SECURE 0x0020 #define RFCOMM_LM_FIPS 0x0040 #define rfcomm_pi(sk) ((struct rfcomm_pinfo *) sk) struct rfcomm_pinfo { struct bt_sock bt; bdaddr_t src; bdaddr_t dst; struct rfcomm_dlc *dlc; u8 channel; u8 sec_level; u8 role_switch; }; int rfcomm_init_sockets(void); void rfcomm_cleanup_sockets(void); int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d); /* ---- RFCOMM TTY ---- */ #define RFCOMM_MAX_DEV 256 #define RFCOMMCREATEDEV _IOW('R', 200, int) #define RFCOMMRELEASEDEV _IOW('R', 201, int) #define RFCOMMGETDEVLIST _IOR('R', 210, int) #define RFCOMMGETDEVINFO _IOR('R', 211, int) #define RFCOMMSTEALDLC _IOW('R', 220, int) /* rfcomm_dev.flags bit definitions */ #define RFCOMM_REUSE_DLC 0 #define RFCOMM_RELEASE_ONHUP 1 #define RFCOMM_HANGUP_NOW 2 #define RFCOMM_TTY_ATTACHED 3 #define RFCOMM_DEFUNCT_BIT4 4 /* don't reuse this bit - userspace visible */ /* rfcomm_dev.status bit definitions */ #define RFCOMM_DEV_RELEASED 0 #define RFCOMM_TTY_OWNED 1 struct rfcomm_dev_req { s16 dev_id; u32 flags; bdaddr_t src; bdaddr_t dst; u8 channel; }; struct rfcomm_dev_info { s16 id; u32 flags; u16 state; bdaddr_t src; bdaddr_t dst; u8 channel; }; struct rfcomm_dev_list_req { u16 dev_num; struct rfcomm_dev_info dev_info[] __counted_by(dev_num); }; int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); #ifdef CONFIG_BT_RFCOMM_TTY int rfcomm_init_ttys(void); void rfcomm_cleanup_ttys(void); #else static inline int rfcomm_init_ttys(void) { return 0; } static inline void rfcomm_cleanup_ttys(void) { } #endif #endif /* __RFCOMM_H */
2320 4475 22 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ /* * Common values and helper functions for the ChaCha and XChaCha stream ciphers. * * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's * security. Here they share the same key size, tfm context, and setkey * function; only their IV size and encrypt/decrypt function differ. * * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is * recommended to use the 20-round variant ChaCha20. However, the other * variants can be needed in some performance-sensitive scenarios. The generic * ChaCha code currently allows only the 20 and 12-round variants. */ #ifndef _CRYPTO_CHACHA_H #define _CRYPTO_CHACHA_H #include <asm/unaligned.h> #include <linux/types.h> /* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ #define CHACHA_IV_SIZE 16 #define CHACHA_KEY_SIZE 32 #define CHACHA_BLOCK_SIZE 64 #define CHACHAPOLY_IV_SIZE 12 #define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32)) /* 192-bit nonce, then 64-bit stream position */ #define XCHACHA_IV_SIZE 32 void chacha_block_generic(u32 *state, u8 *stream, int nrounds); static inline void chacha20_block(u32 *state, u8 *stream) { chacha_block_generic(state, stream, 20); } void hchacha_block_arch(const u32 *state, u32 *out, int nrounds); void hchacha_block_generic(const u32 *state, u32 *out, int nrounds); static inline void hchacha_block(const u32 *state, u32 *out, int nrounds) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) hchacha_block_arch(state, out, nrounds); else hchacha_block_generic(state, out, nrounds); } enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, CHACHA_CONSTANT_ND_3 = 0x3320646eU, CHACHA_CONSTANT_2_BY = 0x79622d32U, CHACHA_CONSTANT_TE_K = 0x6b206574U }; static inline void chacha_init_consts(u32 *state) { state[0] = CHACHA_CONSTANT_EXPA; state[1] = CHACHA_CONSTANT_ND_3; state[2] = CHACHA_CONSTANT_2_BY; state[3] = CHACHA_CONSTANT_TE_K; } void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv); static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv) { chacha_init_consts(state); state[4] = key[0]; state[5] = key[1]; state[6] = key[2]; state[7] = key[3]; state[8] = key[4]; state[9] = key[5]; state[10] = key[6]; state[11] = key[7]; state[12] = get_unaligned_le32(iv + 0); state[13] = get_unaligned_le32(iv + 4); state[14] = get_unaligned_le32(iv + 8); state[15] = get_unaligned_le32(iv + 12); } static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) chacha_init_arch(state, key, iv); else chacha_init_generic(state, key, iv); } void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) chacha_crypt_arch(state, dst, src, bytes, nrounds); else chacha_crypt_generic(state, dst, src, bytes, nrounds); } static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { chacha_crypt(state, dst, src, bytes, 20); } #endif /* _CRYPTO_CHACHA_H */
20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0-only /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. * * Authors: Artem Bityutskiy (Битюцкий Артём) * Adrian Hunter */ /* * This file implements UBIFS shrinker which evicts clean znodes from the TNC * tree when Linux VM needs more RAM. * * We do not implement any LRU lists to find oldest znodes to free because it * would add additional overhead to the file system fast paths. So the shrinker * just walks the TNC tree when searching for znodes to free. * * If the root of a TNC sub-tree is clean and old enough, then the children are * also clean and old enough. So the shrinker walks the TNC in level order and * dumps entire sub-trees. * * The age of znodes is just the time-stamp when they were last looked at. * The current shrinker first tries to evict old znodes, then young ones. * * Since the shrinker is global, it has to protect against races with FS * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. */ #include "ubifs.h" /* List of all UBIFS file-system instances */ LIST_HEAD(ubifs_infos); /* * We number each shrinker run and record the number on the ubifs_info structure * so that we can easily work out which ubifs_info structures have already been * done by the current run. */ static unsigned int shrinker_run_no; /* Protects 'ubifs_infos' list */ DEFINE_SPINLOCK(ubifs_infos_lock); /* Global clean znode counter (for all mounted UBIFS instances) */ atomic_long_t ubifs_clean_zn_cnt; /** * shrink_tnc - shrink TNC tree. * @c: UBIFS file-system description object * @nr: number of znodes to free * @age: the age of znodes to free * @contention: if any contention, this is set to %1 * * This function traverses TNC tree and frees clean znodes. It does not free * clean znodes which younger then @age. Returns number of freed znodes. */ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) { int total_freed = 0; struct ubifs_znode *znode, *zprev; time64_t time = ktime_get_seconds(); ubifs_assert(c, mutex_is_locked(&c->umount_mutex)); ubifs_assert(c, mutex_is_locked(&c->tnc_mutex)); if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) return 0; /* * Traverse the TNC tree in levelorder manner, so that it is possible * to destroy large sub-trees. Indeed, if a znode is old, then all its * children are older or of the same age. * * Note, we are holding 'c->tnc_mutex', so we do not have to lock the * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is * changed only when the 'c->tnc_mutex' is held. */ zprev = NULL; znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL); while (znode && total_freed < nr && atomic_long_read(&c->clean_zn_cnt) > 0) { int freed; /* * If the znode is clean, but it is in the 'c->cnext' list, this * means that this znode has just been written to flash as a * part of commit and was marked clean. They will be removed * from the list at end commit. We cannot change the list, * because it is not protected by any mutex (design decision to * make commit really independent and parallel to main I/O). So * we just skip these znodes. * * Note, the 'clean_zn_cnt' counters are not updated until * after the commit, so the UBIFS shrinker does not report * the znodes which are in the 'c->cnext' list as freeable. * * Also note, if the root of a sub-tree is not in 'c->cnext', * then the whole sub-tree is not in 'c->cnext' as well, so it * is safe to dump whole sub-tree. */ if (znode->cnext) { /* * Very soon these znodes will be removed from the list * and become freeable. */ *contention = 1; } else if (!ubifs_zn_dirty(znode) && abs(time - znode->time) >= age) { if (znode->parent) znode->parent->zbranch[znode->iip].znode = NULL; else c->zroot.znode = NULL; freed = ubifs_destroy_tnc_subtree(c, znode); atomic_long_sub(freed, &ubifs_clean_zn_cnt); atomic_long_sub(freed, &c->clean_zn_cnt); total_freed += freed; znode = zprev; } if (unlikely(!c->zroot.znode)) break; zprev = znode; znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode); cond_resched(); } return total_freed; } /** * shrink_tnc_trees - shrink UBIFS TNC trees. * @nr: number of znodes to free * @age: the age of znodes to free * @contention: if any contention, this is set to %1 * * This function walks the list of mounted UBIFS file-systems and frees clean * znodes which are older than @age, until at least @nr znodes are freed. * Returns the number of freed znodes. */ static int shrink_tnc_trees(int nr, int age, int *contention) { struct ubifs_info *c; struct list_head *p; unsigned int run_no; int freed = 0; spin_lock(&ubifs_infos_lock); do { run_no = ++shrinker_run_no; } while (run_no == 0); /* Iterate over all mounted UBIFS file-systems and try to shrink them */ p = ubifs_infos.next; while (p != &ubifs_infos) { c = list_entry(p, struct ubifs_info, infos_list); /* * We move the ones we do to the end of the list, so we stop * when we see one we have already done. */ if (c->shrinker_run_no == run_no) break; if (!mutex_trylock(&c->umount_mutex)) { /* Some un-mount is in progress, try next FS */ *contention = 1; p = p->next; continue; } /* * We're holding 'c->umount_mutex', so the file-system won't go * away. */ if (!mutex_trylock(&c->tnc_mutex)) { mutex_unlock(&c->umount_mutex); *contention = 1; p = p->next; continue; } spin_unlock(&ubifs_infos_lock); /* * OK, now we have TNC locked, the file-system cannot go away - * it is safe to reap the cache. */ c->shrinker_run_no = run_no; freed += shrink_tnc(c, nr, age, contention); mutex_unlock(&c->tnc_mutex); spin_lock(&ubifs_infos_lock); /* Get the next list element before we move this one */ p = p->next; /* * Move this one to the end of the list to provide some * fairness. */ list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; } spin_unlock(&ubifs_infos_lock); return freed; } /** * kick_a_thread - kick a background thread to start commit. * * This function kicks a background thread to start background commit. Returns * %-1 if a thread was kicked or there is another reason to assume the memory * will soon be freed or become freeable. If there are no dirty znodes, returns * %0. */ static int kick_a_thread(void) { int i; struct ubifs_info *c; /* * Iterate over all mounted UBIFS file-systems and find out if there is * already an ongoing commit operation there. If no, then iterate for * the second time and initiate background commit. */ spin_lock(&ubifs_infos_lock); for (i = 0; i < 2; i++) { list_for_each_entry(c, &ubifs_infos, infos_list) { long dirty_zn_cnt; if (!mutex_trylock(&c->umount_mutex)) { /* * Some un-mount is in progress, it will * certainly free memory, so just return. */ spin_unlock(&ubifs_infos_lock); return -1; } dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || c->ro_mount || c->ro_error) { mutex_unlock(&c->umount_mutex); continue; } if (c->cmt_state != COMMIT_RESTING) { spin_unlock(&ubifs_infos_lock); mutex_unlock(&c->umount_mutex); return -1; } if (i == 1) { list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); mutex_unlock(&c->umount_mutex); return -1; } mutex_unlock(&c->umount_mutex); } } spin_unlock(&ubifs_infos_lock); return 0; } unsigned long ubifs_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); /* * Due to the way UBIFS updates the clean znode counter it may * temporarily be negative. */ return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; } unsigned long ubifs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { unsigned long nr = sc->nr_to_scan; int contention = 0; unsigned long freed; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (!clean_zn_cnt) { /* * No clean znodes, nothing to reap. All we can do in this case * is to kick background threads to start commit, which will * probably make clean znodes which, in turn, will be freeable. * And we return -1 which means will make VM call us again * later. */ dbg_tnc("no clean znodes, kick a thread"); return kick_a_thread(); } freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); if (freed >= nr) goto out; dbg_tnc("not enough old znodes, try to free young ones"); freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); if (freed >= nr) goto out; dbg_tnc("not enough young znodes, free all"); freed += shrink_tnc_trees(nr - freed, 0, &contention); if (!freed && contention) { dbg_tnc("freed nothing, but contention"); return SHRINK_STOP; } out: dbg_tnc("%lu znodes were freed, requested %lu", freed, nr); return freed; }
126 126 126 126 126 100 48 71 76 70 83 91 91 91 17 30 30 88 88 51 51 51 2 70 69 70 70 70 21 21 21 99 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 /* SPDX-License-Identifier: LGPL-2.1+ */ /* Copyright (C) 2022 Kent Overstreet */ #ifndef _BCACHEFS_PRINTBUF_H #define _BCACHEFS_PRINTBUF_H /* * Printbufs: Simple strings for printing to, with optional heap allocation * * This code has provisions for use in userspace, to aid in making other code * portable between kernelspace and userspace. * * Basic example: * struct printbuf buf = PRINTBUF; * * prt_printf(&buf, "foo="); * foo_to_text(&buf, foo); * printk("%s", buf.buf); * printbuf_exit(&buf); * * Or * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) * * We can now write pretty printers instead of writing code that dumps * everything to the kernel log buffer, and then those pretty-printers can be * used by other code that outputs to kernel log, sysfs, debugfs, etc. * * Memory allocation: Outputing to a printbuf may allocate memory. This * allocation is done with GFP_KERNEL, by default: use the newer * memalloc_*_(save|restore) functions as needed. * * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. * * It's allowed to grab the output buffer and free it later with kfree() instead * of using printbuf_exit(), if the user just needs a heap allocated string at * the end. * * Memory allocation failures: We don't return errors directly, because on * memory allocation failure we usually don't want to bail out and unwind - we * want to print what we've got, on a best-effort basis. But code that does want * to return -ENOMEM may check printbuf.allocation_failure. * * Indenting, tabstops: * * To aid is writing multi-line pretty printers spread across multiple * functions, printbufs track the current indent level. * * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent * level, respectively. * * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from * start of line. Once set, prt_tab() will output spaces up to the next tabstop. * prt_tab_rjust() will also advance the current line of text up to the next * tabstop, but it does so by shifting text since the previous tabstop up to the * next tabstop - right justifying it. * * Make sure you use prt_newline() instead of \n in the format string for indent * level and tabstops to work corretly. * * Output units: printbuf->units exists to tell pretty-printers how to output * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as * human readable bytes. prt_units() obeys it. */ #include <linux/kernel.h> #include <linux/string.h> enum printbuf_si { PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ }; #define PRINTBUF_INLINE_TABSTOPS 6 struct printbuf { char *buf; unsigned size; unsigned pos; unsigned last_newline; unsigned last_field; unsigned indent; /* * If nonzero, allocations will be done with GFP_ATOMIC: */ u8 atomic; bool allocation_failure:1; bool heap_allocated:1; bool overflow:1; enum printbuf_si si_units:1; bool human_readable_units:1; bool has_indent_or_tabstops:1; bool suppress_indent_tabstop_handling:1; u8 nr_tabstops; /* * Do not modify directly: use printbuf_tabstop_add(), * printbuf_tabstop_get() */ u8 cur_tabstop; u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; }; int bch2_printbuf_make_room(struct printbuf *, unsigned); __printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); __printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); const char *bch2_printbuf_str(const struct printbuf *); void bch2_printbuf_exit(struct printbuf *); void bch2_printbuf_tabstops_reset(struct printbuf *); void bch2_printbuf_tabstop_pop(struct printbuf *); int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); void bch2_printbuf_indent_add(struct printbuf *, unsigned); void bch2_printbuf_indent_sub(struct printbuf *, unsigned); void bch2_prt_newline(struct printbuf *); void bch2_printbuf_strip_trailing_newline(struct printbuf *); void bch2_prt_tab(struct printbuf *); void bch2_prt_tab_rjust(struct printbuf *); void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); void bch2_prt_human_readable_u64(struct printbuf *, u64); void bch2_prt_human_readable_s64(struct printbuf *, s64); void bch2_prt_units_u64(struct printbuf *, u64); void bch2_prt_units_s64(struct printbuf *, s64); void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], unsigned long *, unsigned); /* Initializer for a heap allocated printbuf: */ #define PRINTBUF ((struct printbuf) { .heap_allocated = true }) /* Initializer a printbuf that points to an external buffer: */ #define PRINTBUF_EXTERN(_buf, _size) \ ((struct printbuf) { \ .buf = _buf, \ .size = _size, \ }) /* * Returns size remaining of output buffer: */ static inline unsigned printbuf_remaining_size(struct printbuf *out) { if (WARN_ON(out->size && out->pos >= out->size)) out->pos = out->size - 1; return out->size - out->pos; } /* * Returns number of characters we can print to the output buffer - i.e. * excluding the terminating nul: */ static inline unsigned printbuf_remaining(struct printbuf *out) { return out->size ? printbuf_remaining_size(out) - 1 : 0; } static inline unsigned printbuf_written(struct printbuf *out) { return out->size ? min(out->pos, out->size - 1) : 0; } static inline void printbuf_nul_terminate_reserved(struct printbuf *out) { if (WARN_ON(out->size && out->pos >= out->size)) out->pos = out->size - 1; if (out->size) out->buf[out->pos] = 0; } static inline void printbuf_nul_terminate(struct printbuf *out) { bch2_printbuf_make_room(out, 1); printbuf_nul_terminate_reserved(out); } /* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) out->buf[out->pos++] = c; } /* Doesn't nul terminate: */ static inline void __prt_char(struct printbuf *out, char c) { bch2_printbuf_make_room(out, 1); __prt_char_reserved(out, c); } static inline void prt_char(struct printbuf *out, char c) { bch2_printbuf_make_room(out, 2); __prt_char_reserved(out, c); printbuf_nul_terminate_reserved(out); } static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) { unsigned can_print = min(n, printbuf_remaining(out)); for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = c; } static inline void prt_chars(struct printbuf *out, char c, unsigned n) { bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); printbuf_nul_terminate_reserved(out); } static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); unsigned can_print = min(n, printbuf_remaining(out)); for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = ((char *) b)[i]; printbuf_nul_terminate(out); } static inline void prt_str(struct printbuf *out, const char *str) { prt_bytes(out, str, strlen(str)); } static inline void prt_str_indented(struct printbuf *out, const char *str) { bch2_prt_bytes_indented(out, str, strlen(str)); } static inline void prt_hex_byte(struct printbuf *out, u8 byte) { bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); printbuf_nul_terminate_reserved(out); } static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); printbuf_nul_terminate_reserved(out); } /** * printbuf_reset - re-use a printbuf without freeing and re-initializing it: */ static inline void printbuf_reset(struct printbuf *buf) { buf->pos = 0; buf->allocation_failure = 0; buf->indent = 0; buf->nr_tabstops = 0; buf->cur_tabstop = 0; } /** * printbuf_atomic_inc - mark as entering an atomic section */ static inline void printbuf_atomic_inc(struct printbuf *buf) { buf->atomic++; } /** * printbuf_atomic_inc - mark as leaving an atomic section */ static inline void printbuf_atomic_dec(struct printbuf *buf) { buf->atomic--; } #endif /* _BCACHEFS_PRINTBUF_H */
20499 8 8 9 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_STACK_H #define _LINUX_SCHED_TASK_STACK_H /* * task->stack (kernel stack) handling interfaces: */ #include <linux/sched.h> #include <linux/magic.h> #include <linux/refcount.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * When accessing the stack of a non-current task that might exit, use * try_get_task_stack() instead. task_stack_page will return a pointer * that could get freed out from under you. */ static __always_inline void *task_stack_page(const struct task_struct *task) { return task->stack; } #define setup_thread_stack(new,old) do { } while(0) static __always_inline unsigned long *end_of_stack(const struct task_struct *task) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; #else return task->stack; #endif } #elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; } /* * Return the address of the last usable long on the stack. * * When the stack grows down, this is just above the thread * info struct. Going any lower will corrupt the threadinfo. * * When the stack grows up, this is the highest address. * Beyond that position, we corrupt data on the next page. */ static inline unsigned long *end_of_stack(struct task_struct *p) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; #else return (unsigned long *)(task_thread_info(p) + 1); #endif } #endif #ifdef CONFIG_THREAD_INFO_IN_TASK static inline void *try_get_task_stack(struct task_struct *tsk) { return refcount_inc_not_zero(&tsk->stack_refcount) ? task_stack_page(tsk) : NULL; } extern void put_task_stack(struct task_struct *tsk); #else static inline void *try_get_task_stack(struct task_struct *tsk) { return task_stack_page(tsk); } static inline void put_task_stack(struct task_struct *tsk) {} #endif void exit_task_stack_account(struct task_struct *tsk); #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); #ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: * Some APM bios versions misalign the stack */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } #endif #endif /* _LINUX_SCHED_TASK_STACK_H */
1202 152 435 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ struct mm_struct; #endif #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> #include <linux/static_call_types.h> #include <asm/frame.h> u64 dummy_steal_clock(int cpu); u64 dummy_sched_clock(void); DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)); static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; __visible void __native_queued_spin_unlock(struct qspinlock *lock); bool pv_is_native_spin_unlock(void); __visible bool __native_vcpu_is_preempted(long cpu); bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { return static_call(pv_steal_clock)(cpu); } #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init paravirt_set_cap(void); #endif /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); } static inline void notify_page_enc_status_changed(unsigned long pfn, int npages, bool enc) { PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(cpu.write_cr0, x); } static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(cpu.write_cr4, x); } static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(irq.halt); } extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { PVOP_VCALL3(cpu.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); } static inline int paravirt_write_msr_safe(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ paravirt_write_msr(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) { wrmsr(msr, (u32)val, (u32)(val>>32)); } #define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ int _err; \ u64 _l = paravirt_read_msr_safe(msr, &_err); \ (*a) = (u32)_l; \ (*b) = _l >> 32; \ _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; *p = paravirt_read_msr_safe(msr, &err); return err; } static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } #define rdpmc(counter, low, high) \ do { \ u64 _l = paravirt_read_pmc(counter); \ low = (u32)_l; \ high = _l >> 32; \ } while (0) #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); } #endif static inline void paravirt_enter_mmap(struct mm_struct *next) { PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(mmu.set_p4d, p4dp, val); } #if CONFIG_PGTABLE_LEVELS >= 5 static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } #endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, "movb $0, (%%" _ASM_ARG1 ");", ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) { PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, "xor %%" _ASM_AX ", %%" _ASM_AX ";", ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ "push %rcx;" \ "push %rdx;" \ "push %rsi;" \ "push %rdi;" \ "push %r8;" \ "push %r9;" \ "push %r10;" \ "push %r11;" #define PV_RESTORE_ALL_CALLER_REGS \ "pop %r11;" \ "pop %r10;" \ "pop %r9;" \ "pop %r8;" \ "pop %rdi;" \ "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" #endif /* * Generate a thunk around a function which saves all caller-save * registers except for the return value. This allows C functions to * be called from assembler code where fewer than normal registers are * available. It may also help code generation around calls from C * code if the common case doesn't use many registers. * * When a callee is wrapped in a thunk, the caller can assume that all * arg regs and all scratch registers are preserved across the * call. The return value in rax/eax will not be saved, even for void * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ ASM_RET \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") #define PV_CALLEE_SAVE_REGS_THUNK(func) \ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) /* Promise that "func" already uses the right calling convention */ #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif /* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL #undef __PVOP_CALL #undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 #undef PVOP_CALL1 #undef PVOP_VCALL2 #undef PVOP_CALL2 #undef PVOP_VCALL3 #undef PVOP_CALL3 #undef PVOP_VCALL4 #undef PVOP_CALL4 extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl ANNOTATE_RETPOLINE_SAFE; call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm #define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #ifndef __ASSEMBLY__ static inline void native_pv_lock_init(void) { } #endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT_SPINLOCKS static inline void paravirt_set_cap(void) { } #endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */
31 31 117 116 31 31 31 31 31 31 8 8 117 117 117 116 117 117 66 50 50 5759 5745 5753 5760 5762 1428 1427 1430 5801 1450 40 31 35 9 83 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mempool.c * * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. * * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/writeback.h> #include "slab.h" #ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(mempool_t *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); int i; pr_err("BUG: mempool element poison mismatch\n"); pr_err("Mempool %p size %zu\n", pool, size); pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); for (i = start; i < end; i++) pr_cont("%x ", *(u8 *)(element + i)); pr_cont("%s\n", end < size ? "..." : ""); dump_stack(); } static void __check_element(mempool_t *pool, void *element, size_t size) { u8 *obj = element; size_t i; for (i = 0; i < size; i++) { u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; if (obj[i] != exp) { poison_error(pool, element, size, i); return; } } memset(obj, POISON_INUSE, size); } static void check_element(mempool_t *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->free == mempool_kfree) { __check_element(pool, element, (size_t)pool->pool_data); } else if (pool->free == mempool_free_slab) { __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_local_page((struct page *)element); __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); kunmap_local(addr); } } static void __poison_element(void *element, size_t size) { u8 *obj = element; memset(obj, POISON_FREE, size - 1); obj[size - 1] = POISON_END; } static void poison_element(mempool_t *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->alloc == mempool_kmalloc) { __poison_element(element, (size_t)pool->pool_data); } else if (pool->alloc == mempool_alloc_slab) { __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_local_page((struct page *)element); __poison_element(addr, 1UL << (PAGE_SHIFT + order)); kunmap_local(addr); } } #else /* CONFIG_SLUB_DEBUG_ON */ static inline void check_element(mempool_t *pool, void *element) { } static inline void poison_element(mempool_t *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); else if (pool->alloc == mempool_alloc_pages) return kasan_mempool_poison_pages(element, (unsigned long)pool->pool_data); return true; } static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); else if (pool->alloc == mempool_alloc_slab) kasan_mempool_unpoison_object(element, kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_mempool_unpoison_pages(element, (unsigned long)pool->pool_data); } static __always_inline void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; } static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); kasan_unpoison_element(pool, element); check_element(pool, element); return element; } /** * mempool_exit - exit a mempool initialized with mempool_init() * @pool: pointer to the memory pool which was initialized with * mempool_init(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. * * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ void mempool_exit(mempool_t *pool) { while (pool->curr_nr) { void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); pool->elements = NULL; } EXPORT_SYMBOL(mempool_exit); /** * mempool_destroy - deallocate a memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ void mempool_destroy(mempool_t *pool) { if (unlikely(!pool)) return; mempool_exit(pool); kfree(pool); } EXPORT_SYMBOL(mempool_destroy); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; pool->pool_data = pool_data; pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); pool->elements = kmalloc_array_node(min_nr, sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* * First pre-allocate the guaranteed number of buffers. */ while (pool->curr_nr < pool->min_nr) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); if (unlikely(!element)) { mempool_exit(pool); return -ENOMEM; } add_element(pool, element); } return 0; } EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). * * Return: %0 on success, negative error code otherwise. */ int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_init_noprof); /** * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * @gfp_mask: memory allocation flags * @node_id: numa node to allocate on * * this function creates and allocates a guaranteed size, preallocated * memory pool. The pool can be used from the mempool_alloc() and mempool_free() * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. * * Return: pointer to the created memory pool object or %NULL on error. */ mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { mempool_t *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, gfp_mask, node_id)) { kfree(pool); return NULL; } return pool; } EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. * * Return: %0 on success, negative error code otherwise. */ int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); } pool->min_nr = new_min_nr; goto out_unlock; } spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), GFP_KERNEL); if (!new_elements) return -ENOMEM; spin_lock_irqsave(&pool->lock, flags); if (unlikely(new_min_nr <= pool->min_nr)) { /* Raced, other resize will do our work */ spin_unlock_irqrestore(&pool->lock, flags); kfree(new_elements); goto out; } memcpy(new_elements, pool->elements, pool->curr_nr * sizeof(*new_elements)); kfree(pool->elements); pool->elements = new_elements; pool->min_nr = new_min_nr; while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { add_element(pool, element); } else { spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); /* Raced */ goto out; } } out_unlock: spin_unlock_irqrestore(&pool->lock, flags); out: return 0; } EXPORT_SYMBOL(mempool_resize); /** * mempool_alloc - allocate an element from a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * @gfp_mask: the usual allocation bitmask. * * this function only sleeps if the alloc_fn() function sleeps or * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) * Note: using __GFP_ZERO is not supported. * * Return: pointer to the allocated element or %NULL on error. */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) return element; spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(element); return element; } /* * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ if (gfp_temp != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } /* Let's wait for someone else to return an element to @pool */ init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irqrestore(&pool->lock, flags); /* * FIXME: this should be io_schedule(). The timeout is there as a * workaround for some DM problems in 2.6.18. */ io_schedule_timeout(5*HZ); finish_wait(&pool->wait, &wait); goto repeat_alloc; } EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements * belonging to a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * This function is similar to mempool_alloc, but it only attempts allocating * an element from the preallocated elements. It does not sleep and immediately * returns if no preallocated elements are available. * * Return: pointer to the allocated element or %NULL if no elements are * available. */ void *mempool_alloc_preallocated(mempool_t *pool) { void *element; unsigned long flags; spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(element); return element; } spin_unlock_irqrestore(&pool->lock, flags); return NULL; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** * mempool_free - return an element to the pool. * @element: pool element pointer. * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * this function only sleeps if the free_fn() function sleeps. */ void mempool_free(void *element, mempool_t *pool) { unsigned long flags; if (unlikely(element == NULL)) return; /* * Paired with the wmb in mempool_alloc(). The preceding read is * for @element and the following @pool->curr_nr. This ensures * that the visible value of @pool->curr_nr is from after the * allocation of @element. This is necessary for fringe cases * where @element was passed to this task without going through * barriers. * * For example, assume @p is %NULL at the beginning and one task * performs "p = mempool_alloc(...);" while another task is doing * "while (!p) cpu_relax(); mempool_free(p, ...);". This function * may end up using curr_nr value which is from before allocation * of @p without the following rmb. */ smp_rmb(); /* * For correctness, we need a test which is guaranteed to trigger * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr * without locking achieves that and refilling as soon as possible * is desirable. * * Because curr_nr visible here is always a value after the * allocation of @element, any task which decremented curr_nr below * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets * incremented to min_nr afterwards. If curr_nr gets incremented * to min_nr after the allocation of @element, the elements * allocated after that are subject to the same guarantee. * * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. */ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); return; } spin_unlock_irqrestore(&pool->lock, flags); } pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); /* * A commonly used alloc and free fn. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); /* * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory * specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void mempool_kfree(void *element, void *pool_data) { kfree(element); } EXPORT_SYMBOL(mempool_kfree); void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kvmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kvmalloc); void mempool_kvfree(void *element, void *pool_data) { kvfree(element); } EXPORT_SYMBOL(mempool_kvfree); /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); void mempool_free_pages(void *element, void *pool_data) { int order = (int)(long)pool_data; __free_pages(element, order); } EXPORT_SYMBOL(mempool_free_pages);
121 117 118 110 118 118 118 118 118 117 116 22 22 8 22 118 117 108 108 99 118 117 118 118 113 106 118 30 30 118 86 86 109 109 99 109 117 118 118 117 118 117 118 118 117 116 101 126 8 8 6 6 6 6 6 6 8 8 8 8 8 8 123 123 123 123 122 120 120 121 121 118 121 118 119 121 121 119 121 121 121 120 22 125 29 29 275 275 274 273 275 275 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 // SPDX-License-Identifier: GPL-2.0 /* * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, * for the blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/sbitmap.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" /* * See Documentation/block/deadline-iosched.rst */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ /* * Time after which to dispatch lower priority requests even if higher * priority requests are pending. */ static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ enum dd_data_dir { DD_READ = READ, DD_WRITE = WRITE, }; enum { DD_DIR_COUNT = 2 }; enum dd_prio { DD_RT_PRIO = 0, DD_BE_PRIO = 1, DD_IDLE_PRIO = 2, DD_PRIO_MAX = 2, }; enum { DD_PRIO_COUNT = 3 }; /* * I/O statistics per I/O priority. It is fine if these counters overflow. * What matters is that these counters are at least as wide as * log2(max_outstanding_requests). */ struct io_stats_per_prio { uint32_t inserted; uint32_t merged; uint32_t dispatched; atomic_t completed; }; /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Position of the most recently dispatched request. */ sector_t latest_pos[DD_DIR_COUNT]; struct io_stats_per_prio stats; }; struct deadline_data { /* * run time data */ struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; u32 async_depth; int prio_aging_expire; spinlock_t lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_RT] = DD_RT_PRIO, [IOPRIO_CLASS_BE] = DD_BE_PRIO, [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { return &per_prio->sort_list[rq_data_dir(rq)]; } /* * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a * request. */ static u8 dd_rq_ioclass(struct request *rq) { return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) { struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; if (!node) return NULL; rq = rb_entry_rq(node); while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { res = rq; node = node->rb_left; } else { node = node->rb_right; } } return res; } static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ static void deadline_remove_request(struct request_queue *q, struct dd_per_prio *per_prio, struct request *rq) { list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) q->last_merge = NULL; } static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(req); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { elv_rb_del(deadline_rb_root(per_prio, req), req); deadline_add_rq_rb(per_prio, req); } } /* * Callback function that is invoked after @next has been merged into @req. */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; lockdep_assert_held(&dd->lock); dd->per_prio[prio].stats.merged++; /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { if (time_before((unsigned long)next->fifo_time, (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } } /* * kill knowledge of next, this one is a goner */ deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { /* * take it off the sort and fifo list */ deadline_remove_request(rq->q, per_prio, rq); } /* Number of requests queued for a given priority level. */ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->inserted - atomic_read(&stats->completed); } /* * deadline_check_fifo returns true if and only if there are expired requests * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]). */ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* * For the specified data direction, return the next request to * dispatch using sector position sorted lists. */ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { return deadline_from_pos(per_prio, data_dir, per_prio->latest_pos[data_dir]); } /* * Returns true if and only if @rq started after @latest_start where * @latest_start is in jiffies. */ static bool started_after(struct deadline_data *dd, struct request *rq, unsigned long latest_start) { unsigned long start_time = (unsigned long)rq->fifo_time; start_time -= dd->fifo_expire[rq_data_dir(rq)]; return time_after(start_time, latest_start); } /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, struct dd_per_prio *per_prio, unsigned long latest_start) { struct request *rq, *next_rq; enum dd_data_dir data_dir; enum dd_prio prio; u8 ioprio_class; lockdep_assert_held(&dd->lock); if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); if (started_after(dd, rq, latest_start)) return NULL; list_del_init(&rq->queuelist); data_dir = rq_data_dir(rq); goto done; } /* * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) { /* we have a next request and are still entitled to batch */ data_dir = rq_data_dir(rq); goto dispatch_request; } /* * at this point we are not running a batch. select the appropriate * data direction (read / write) */ if (!list_empty(&per_prio->fifo_list[DD_READ])) { BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = DD_READ; goto dispatch_find_request; } /* * there are either no reads or writes have been starved */ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; data_dir = DD_WRITE; goto dispatch_find_request; } return NULL; dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ next_rq = deadline_next_request(dd, per_prio, data_dir); if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ rq = next_rq; } if (!rq) return NULL; dd->last_dir = data_dir; dd->batching = 0; dispatch_request: if (started_after(dd, rq, latest_start)) return NULL; /* * rq is the selected appropriate request. */ dd->batching++; deadline_move_request(dd, per_prio, rq); done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; rq->rq_flags |= RQF_STARTED; return rq; } /* * Check whether there are any requests with priority other than DD_RT_PRIO * that were inserted more than prio_aging_expire jiffies ago. */ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, unsigned long now) { struct request *rq; enum dd_prio prio; int prio_cnt; lockdep_assert_held(&dd->lock); prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + !!dd_queued(dd, DD_IDLE_PRIO); if (prio_cnt < 2) return NULL; for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now - dd->prio_aging_expire); if (rq) return rq; } return NULL; } /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; /* * Next, dispatch requests in priority order. Ignore lower priority * requests if any higher priority requests are pending. */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); if (rq || dd_queued(dd, prio)) break; } unlock: spin_unlock(&dd->lock); return rq; } /* * 'depth' is a number in the range 1..INT_MAX representing a number of * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). * Values larger than q->nr_requests have the same effect as q->nr_requests. */ static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; const unsigned int nrr = hctx->queue->nr_requests; return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; } /* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct deadline_data *dd = data->q->elevator->elevator_data; /* Do not throttle synchronous reads. */ if (op_is_sync(opf) && !op_is_write(opf)) return; /* * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); } /* Called by blk_mq_update_nr_requests(). */ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = q->nr_requests; sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { dd_depth_updated(hctx); return 0; } static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; const struct io_stats_per_prio *stats = &per_prio->stats; uint32_t queued; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); spin_lock(&dd->lock); queued = dd_queued(dd, prio); spin_unlock(&dd->lock); WARN_ONCE(queued != 0, "statistics for priority %d: i %u m %u d %u c %u\n", prio, stats->inserted, stats->merged, stats->dispatched, atomic_read(&stats->completed)); } kfree(dd); } /* * initialize elevator private data (deadline_data). */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; enum dd_prio prio; int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) goto put_eq; eq->elevator_data = dd; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; INIT_LIST_HEAD(&per_prio->dispatch); INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; per_prio->sort_list[DD_WRITE] = RB_ROOT; } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; return 0; put_eq: kobject_put(&eq->kobj); return ret; } /* * Try to merge @bio into an existing request. If @bio has been merged into * an existing request, store the pointer to that request into *@rq. */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } return ELEVATOR_NO_MERGE; } /* * Attempt to merge a bio into an existing request. This function is called * before @bio is associated with a request. */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; spin_lock(&dd->lock); ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); if (free) blk_mq_free_request(free); return ret; } /* * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; lockdep_assert_held(&dd->lock); prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) { per_prio->stats.inserted++; rq->elv.priv[0] = (void *)(uintptr_t)1; } if (blk_mq_sched_try_insert_merge(q, rq, free)) return; trace_block_rq_insert(rq); if (flags & BLK_MQ_INSERT_AT_HEAD) { list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { struct list_head *insert_before; deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } /* * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; insert_before = &per_prio->fifo_list[data_dir]; list_add_tail(&rq->queuelist, insert_before); } } /* * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); blk_mq_free_requests(&free); } /* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { rq->elv.priv[0] = NULL; } /* * Callback from inside blk_mq_free_request(). */ static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(rq); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ if (rq->elv.priv[0]) atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) { return !list_empty_careful(&per_prio->dispatch) || !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; return false; } /* * sysfs parts below */ #define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ \ return sysfs_emit(page, "%d\n", __VAR); \ } #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data, __ret; \ \ __ret = kstrtoint(page, 0, &__data); \ if (__ret < 0) \ return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = __CONV(__data); \ return count; \ } #define STORE_INT(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) #define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT #undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), DD_ATTR(prio_aging_expire), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ .start = deadline_##name##_fifo_start, \ .next = deadline_##name##_fifo_next, \ .stop = deadline_##name##_fifo_stop, \ .show = blk_mq_debugfs_rq_show, \ }; \ \ static int deadline_##name##_next_rq_show(void *data, \ struct seq_file *m) \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ struct request *rq; \ \ rq = deadline_from_pos(per_prio, data_dir, \ per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->batching); return 0; } static int deadline_starved_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->starved); return 0; } static int dd_async_depth_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->async_depth); return 0; } static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_queued(dd, DD_RT_PRIO); be = dd_queued(dd, DD_BE_PRIO); idle = dd_queued(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } /* Number of requests owned by the block driver for a given priority. */ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->dispatched + stats->merged - atomic_read(&stats->completed); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_owned_by_driver(dd, DD_RT_PRIO); be = dd_owned_by_driver(dd, DD_BE_PRIO); idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } #define DEADLINE_DISPATCH_ATTR(prio) \ static void *deadline_dispatch##prio##_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->dispatch, *pos); \ } \ \ static void *deadline_dispatch##prio##_next(struct seq_file *m, \ void *v, loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->dispatch, pos); \ } \ \ static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ .start = deadline_dispatch##prio##_start, \ .next = deadline_dispatch##prio##_next, \ .stop = deadline_dispatch##prio##_stop, \ .show = blk_mq_debugfs_rq_show, \ } DEADLINE_DISPATCH_ATTR(0); DEADLINE_DISPATCH_ATTR(1); DEADLINE_DISPATCH_ATTR(2); #undef DEADLINE_DISPATCH_ATTR #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ .seq_ops = &deadline_##name##_fifo_seq_ops} #define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(read0), DEADLINE_QUEUE_DDIR_ATTRS(write0), DEADLINE_QUEUE_DDIR_ATTRS(read1), DEADLINE_QUEUE_DDIR_ATTRS(write1), DEADLINE_QUEUE_DDIR_ATTRS(read2), DEADLINE_QUEUE_DDIR_ATTRS(write2), DEADLINE_NEXT_RQ_ATTR(read0), DEADLINE_NEXT_RQ_ATTR(write0), DEADLINE_NEXT_RQ_ATTR(read1), DEADLINE_NEXT_RQ_ATTR(write1), DEADLINE_NEXT_RQ_ATTR(read2), DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS #endif static struct elevator_type mq_deadline = { .ops = { .depth_updated = dd_depth_updated, .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, .finish_request = dd_finish_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { elv_unregister(&mq_deadline); } module_init(deadline_init); module_exit(deadline_exit); MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler");
29 29 29 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/ialloc.c * * minix/bitmap.c * Copyright (C) 1991, 1992 Linus Torvalds * * ext/freelists.c * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) * * xenix/alloc.c * Copyright (C) 1992 Doug Evans * * coh/alloc.c * Copyright (C) 1993 Pascal Haible, Bruno Haible * * sysv/ialloc.c * Copyright (C) 1993 Bruno Haible * * This file contains code for allocating/freeing inodes. */ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/sched.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include "sysv.h" /* We don't trust the value of sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes but we nevertheless keep it up to date. */ /* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */ /* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ static inline sysv_ino_t * sv_sb_fic_inode(struct super_block * sb, unsigned int i) { struct sysv_sb_info *sbi = SYSV_SB(sb); if (sbi->s_bh1 == sbi->s_bh2) return &sbi->s_sb_fic_inodes[i]; else { /* 512 byte Xenix FS */ unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); if (offset < 512) return (sysv_ino_t*)(sbi->s_sbd1 + offset); else return (sysv_ino_t*)(sbi->s_sbd2 + offset); } } struct sysv_inode * sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) { struct sysv_sb_info *sbi = SYSV_SB(sb); struct sysv_inode *res; int block = sbi->s_firstinodezone + sbi->s_block_base; block += (ino-1) >> sbi->s_inodes_per_block_bits; *bh = sb_bread(sb, block); if (!*bh) return NULL; res = (struct sysv_inode *)(*bh)->b_data; return res + ((ino-1) & sbi->s_inodes_per_block_1); } static int refill_free_cache(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); struct buffer_head * bh; struct sysv_inode * raw_inode; int i = 0, ino; ino = SYSV_ROOT_INO+1; raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) goto out; while (ino <= sbi->s_ninodes) { if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); if (i == sbi->s_fic_size) break; } if ((ino++ & sbi->s_inodes_per_block_1) == 0) { brelse(bh); raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) goto out; } else raw_inode++; } brelse(bh); out: return i; } void sysv_free_inode(struct inode * inode) { struct super_block *sb = inode->i_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); unsigned int ino; struct buffer_head * bh; struct sysv_inode * raw_inode; unsigned count; sb = inode->i_sb; ino = inode->i_ino; if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); return; } raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) { printk("sysv_free_inode: unable to read inode block on device " "%s\n", inode->i_sb->s_id); return; } mutex_lock(&sbi->s_lock); count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); if (count < sbi->s_fic_size) { *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); } fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); dirty_sb(sb); memset(raw_inode, 0, sizeof(struct sysv_inode)); mark_buffer_dirty(bh); mutex_unlock(&sbi->s_lock); brelse(bh); } struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); struct inode *inode; sysv_ino_t ino; unsigned count; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE }; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); mutex_lock(&sbi->s_lock); count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { count = refill_free_cache(sb); if (count == 0) { iput(inode); mutex_unlock(&sbi->s_lock); return ERR_PTR(-ENOSPC); } } /* Now count > 0. */ ino = *sv_sb_fic_inode(sb,--count); *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); simple_inode_init_ts(inode); inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; insert_inode_hash(inode); mark_inode_dirty(inode); sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ /* That's it. */ mutex_unlock(&sbi->s_lock); return inode; } unsigned long sysv_count_free_inodes(struct super_block * sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); struct buffer_head * bh; struct sysv_inode * raw_inode; int ino, count, sb_count; mutex_lock(&sbi->s_lock); sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); if (0) goto trust_sb; /* this causes a lot of disk traffic ... */ count = 0; ino = SYSV_ROOT_INO+1; raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) goto Eio; while (ino <= sbi->s_ninodes) { if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) count++; if ((ino++ & sbi->s_inodes_per_block_1) == 0) { brelse(bh); raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) goto Eio; } else raw_inode++; } brelse(bh); if (count != sb_count) goto Einval; out: mutex_unlock(&sbi->s_lock); return count; Einval: printk("sysv_count_free_inodes: " "free inode count was %d, correcting to %d\n", sb_count, count); if (!sb_rdonly(sb)) { *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); dirty_sb(sb); } goto out; Eio: printk("sysv_count_free_inodes: unable to read inode table\n"); trust_sb: count = sb_count; goto out; }
11 4 4 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 // SPDX-License-Identifier: GPL-2.0-only /* * The Virtio 9p transport driver * * This is a block based transport driver based on the lguest block driver * code. * * Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation * * Based on virtio console driver * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/slab.h> #include <net/9p/9p.h> #include <linux/parser.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/scatterlist.h> #include <linux/swap.h> #include <linux/virtio.h> #include <linux/virtio_9p.h> #include "trans_common.h" #define VIRTQUEUE_NUM 128 /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); static DECLARE_WAIT_QUEUE_HEAD(vp_wq); static atomic_t vp_pinned = ATOMIC_INIT(0); /** * struct virtio_chan - per-instance transport information * @inuse: whether the channel is in use * @lock: protects multiple elements within this structure * @client: client instance * @vdev: virtio dev associated with this channel * @vq: virtio queue associated with this channel * @ring_bufs_avail: flag to indicate there is some available in the ring buf * @vc_wq: wait queue for waiting for thing to be added to ring buf * @p9_max_pages: maximum number of pinned pages * @sg: scatter gather list which is used to pack a request (protected?) * @chan_list: linked list of channels * * We keep all per-channel information in a structure. * This structure is allocated within the devices dev->mem space. * A pointer to the structure will get put in the transport private. * */ struct virtio_chan { bool inuse; spinlock_t lock; struct p9_client *client; struct virtio_device *vdev; struct virtqueue *vq; int ring_bufs_avail; wait_queue_head_t *vc_wq; /* This is global limit. Since we don't have a global structure, * will be placing it in each channel. */ unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; /** * @tag: name to identify a mount null terminated */ char *tag; struct list_head chan_list; }; static struct list_head virtio_chan_list; /* How many bytes left in this page. */ static unsigned int rest_of_page(void *data) { return PAGE_SIZE - offset_in_page(data); } /** * p9_virtio_close - reclaim resources of a channel * @client: client instance * * This reclaims a channel by freeing its resources and * resetting its inuse flag. * */ static void p9_virtio_close(struct p9_client *client) { struct virtio_chan *chan = client->trans; mutex_lock(&virtio_9p_lock); if (chan) chan->inuse = false; mutex_unlock(&virtio_9p_lock); } /** * req_done - callback which signals activity from the server * @vq: virtio queue activity was received on * * This notifies us that the server has triggered some activity * on the virtio channel - most likely a response to request we * sent. Figure out which requests now have responses and wake up * those threads. * * Bugs: could do with some additional sanity checking, but appears to work. * */ static void req_done(struct virtqueue *vq) { struct virtio_chan *chan = vq->vdev->priv; unsigned int len; struct p9_req_t *req; bool need_wakeup = false; unsigned long flags; p9_debug(P9_DEBUG_TRANS, ": request done\n"); spin_lock_irqsave(&chan->lock, flags); while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) { if (!chan->ring_bufs_avail) { chan->ring_bufs_avail = 1; need_wakeup = true; } if (len) { req->rc.size = len; p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. */ if (need_wakeup) wake_up(chan->vc_wq); } /** * pack_sg_list - pack a scatter gather list from a linear buffer * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at * @limit: maximum segment to pack data to * @data: data to pack into scatter/gather list * @count: amount of data to pack into the scatter/gather list * * sg_lists have multiple segments of various sizes. This will pack * arbitrary data into an existing scatter gather list, segmenting the * data as necessary within constraints. * */ static int pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, int count) { int s; int index = start; while (count) { s = rest_of_page(data); if (s > count) s = count; BUG_ON(index >= limit); /* Make sure we don't terminate early. */ sg_unmark_end(&sg[index]); sg_set_buf(&sg[index++], data, s); count -= s; data += s; } if (index-start) sg_mark_end(&sg[index - 1]); return index-start; } /* We don't currently allow canceling of virtio requests */ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) { return 1; } /* Reply won't come, so drop req ref */ static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) { p9_req_put(client, req); return 0; } /** * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at * @limit: maximum number of pages in sg list. * @pdata: a list of pages to add into sg. * @nr_pages: number of pages to pack into the scatter/gather list * @offs: amount of data in the beginning of first page _not_ to pack * @count: amount of data to pack into the scatter/gather list */ static int pack_sg_list_p(struct scatterlist *sg, int start, int limit, struct page **pdata, int nr_pages, size_t offs, int count) { int i = 0, s; int data_off = offs; int index = start; BUG_ON(nr_pages > (limit - start)); /* * if the first page doesn't start at * page boundary find the offset */ while (nr_pages) { s = PAGE_SIZE - data_off; if (s > count) s = count; BUG_ON(index >= limit); /* Make sure we don't terminate early. */ sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); data_off = 0; count -= s; nr_pages--; } if (index-start) sg_mark_end(&sg[index - 1]); return index - start; } /** * p9_virtio_request - issue a request * @client: client instance issuing the request * @req: request to be issued * */ static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { int err; int in, out, out_sgs, in_sgs; unsigned long flags; struct virtio_chan *chan = client->trans; struct scatterlist *sgs[2]; p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); WRITE_ONCE(req->status, REQ_STATUS_SENT); req_retry: spin_lock_irqsave(&chan->lock, flags); out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); err = wait_event_killable(*chan->vc_wq, chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); goto req_retry; } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio rpc add_sgs returned failure\n"); return -EIO; } } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); return 0; } static int p9_get_mapped_pages(struct virtio_chan *chan, struct page ***pages, struct iov_iter *data, int count, size_t *offs, int *need_drop) { int nr_pages; int err; if (!iov_iter_count(data)) return 0; if (!iov_iter_is_kvec(data)) { int n; /* * We allow only p9_max_pages pinned. We wait for the * Other zc request to finish here */ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { err = wait_event_killable(vp_wq, (atomic_read(&vp_pinned) < chan->p9_max_pages)); if (err == -ERESTARTSYS) return err; } n = iov_iter_get_pages_alloc2(data, pages, count, offs); if (n < 0) return n; *need_drop = 1; nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); atomic_add(nr_pages, &vp_pinned); return n; } else { /* kernel buffer, no need to pin pages */ int index; size_t len; void *p; /* we'd already checked that it's non-empty */ while (1) { len = iov_iter_single_seg_count(data); if (likely(len)) { p = data->kvec->iov_base + data->iov_offset; break; } iov_iter_advance(data, 0); } if (len > count) len = count; nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - (unsigned long)p / PAGE_SIZE; *pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!*pages) return -ENOMEM; *need_drop = 0; p -= (*offs = offset_in_page(p)); for (index = 0; index < nr_pages; index++) { if (is_vmalloc_addr(p)) (*pages)[index] = vmalloc_to_page(p); else (*pages)[index] = kmap_to_page(p); p += PAGE_SIZE; } iov_iter_advance(data, len); return len; } } static void handle_rerror(struct p9_req_t *req, int in_hdr_len, size_t offs, struct page **pages) { unsigned size, n; void *to = req->rc.sdata + in_hdr_len; // Fits entirely into the static data? Nothing to do. if (req->rc.size < in_hdr_len || !pages) return; // Really long error message? Tough, truncate the reply. Might get // rejected (we can't be arsed to adjust the size encoded in header, // or string size for that matter), but it wouldn't be anything valid // anyway. if (unlikely(req->rc.size > P9_ZC_HDR_SZ)) req->rc.size = P9_ZC_HDR_SZ; // data won't span more than two pages size = req->rc.size - in_hdr_len; n = PAGE_SIZE - offs; if (size > n) { memcpy_from_page(to, *pages++, offs, n); offs = 0; to += n; size -= n; } memcpy_from_page(to, *pages, offs, size); } /** * p9_virtio_zc_request - issue a zero copy request * @client: client instance issuing the request * @req: request to be issued * @uidata: user buffer that should be used for zero copy read * @uodata: user buffer that should be used for zero copy write * @inlen: read buffer size * @outlen: write buffer size * @in_hdr_len: reader header size, This is the size of response protocol data * */ static int p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct iov_iter *uidata, struct iov_iter *uodata, int inlen, int outlen, int in_hdr_len) { int in, out, err, out_sgs, in_sgs; unsigned long flags; int in_nr_pages = 0, out_nr_pages = 0; struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; struct scatterlist *sgs[4]; size_t offs = 0; int need_drop = 0; int kicked = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); if (uodata) { __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, outlen, &offs, &need_drop); if (n < 0) { err = n; goto err_out; } out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != outlen) { __le32 v = cpu_to_le32(n); memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); outlen = n; } /* The size field of the message must include the length of the * header and the length of the data. We didn't actually know * the length of the data until this point so add it in now. */ sz = cpu_to_le32(req->tc.size + outlen); memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, inlen, &offs, &need_drop); if (n < 0) { err = n; goto err_out; } in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != inlen) { __le32 v = cpu_to_le32(n); memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); inlen = n; } } WRITE_ONCE(req->status, REQ_STATUS_SENT); req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); out_sgs = in_sgs = 0; /* out data */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; if (out_pages) { sgs[out_sgs++] = chan->sg + out; out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out_pages, out_nr_pages, offs, outlen); } /* * Take care of in data * For example TREAD have 11. * 11 is the read/write header = PDU Header(7) + IO Size (4). * Arrange in such a way that server places header in the * allocated memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; if (in_pages) { sgs[out_sgs + in_sgs++] = chan->sg + out + in; pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in_pages, in_nr_pages, offs, inlen); } BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); err = wait_event_killable(*chan->vc_wq, chan->ring_bufs_avail); if (err == -ERESTARTSYS) goto err_out; p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio rpc add_sgs returned failure\n"); err = -EIO; goto err_out; } } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); err = wait_event_killable(req->wq, READ_ONCE(req->status) >= REQ_STATUS_RCVD); // RERROR needs reply (== error string) in static data if (READ_ONCE(req->status) == REQ_STATUS_RCVD && unlikely(req->rc.sdata[4] == P9_RERROR)) handle_rerror(req, in_hdr_len, offs, in_pages); /* * Non kernel buffers are pinned, unpin them */ err_out: if (need_drop) { if (in_pages) { p9_release_pages(in_pages, in_nr_pages); atomic_sub(in_nr_pages, &vp_pinned); } if (out_pages) { p9_release_pages(out_pages, out_nr_pages); atomic_sub(out_nr_pages, &vp_pinned); } /* wakeup anybody waiting for slots to pin pages */ wake_up(&vp_wq); } kvfree(in_pages); kvfree(out_pages); if (!kicked) { /* reply won't come */ p9_req_put(client, req); } return err; } static ssize_t p9_mount_tag_show(struct device *dev, struct device_attribute *attr, char *buf) { struct virtio_chan *chan; struct virtio_device *vdev; int tag_len; vdev = dev_to_virtio(dev); chan = vdev->priv; tag_len = strlen(chan->tag); memcpy(buf, chan->tag, tag_len + 1); return tag_len + 1; } static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); /** * p9_virtio_probe - probe for existence of 9P virtio channels * @vdev: virtio device to probe * * This probes for existing virtio channels. * */ static int p9_virtio_probe(struct virtio_device *vdev) { __u16 tag_len; char *tag; int err; struct virtio_chan *chan; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); if (!chan) { pr_err("Failed to allocate virtio 9P channel\n"); err = -ENOMEM; goto fail; } chan->vdev = vdev; /* We expect one virtqueue, for requests. */ chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); if (IS_ERR(chan->vq)) { err = PTR_ERR(chan->vq); goto out_free_chan; } chan->vq->vdev->priv = chan; spin_lock_init(&chan->lock); sg_init_table(chan->sg, VIRTQUEUE_NUM); chan->inuse = false; if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) { virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len); } else { err = -EINVAL; goto out_free_vq; } tag = kzalloc(tag_len + 1, GFP_KERNEL); if (!tag) { err = -ENOMEM; goto out_free_vq; } virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), tag, tag_len); chan->tag = tag; err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); if (err) { goto out_free_tag; } chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); if (!chan->vc_wq) { err = -ENOMEM; goto out_remove_file; } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; /* Ceiling limit to avoid denial of service attacks */ chan->p9_max_pages = nr_free_buffer_pages()/4; virtio_device_ready(vdev); mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); mutex_unlock(&virtio_9p_lock); /* Let udev rules use the new mount_tag attribute. */ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); return 0; out_remove_file: sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); out_free_tag: kfree(tag); out_free_vq: vdev->config->del_vqs(vdev); out_free_chan: kfree(chan); fail: return err; } /** * p9_virtio_create - allocate a new virtio channel * @client: client instance invoking this transport * @devname: string identifying the channel to connect to (unused) * @args: args passed from sys_mount() for per-transport options (unused) * * This sets up a transport channel for 9p communication. Right now * we only match the first available channel, but eventually we could look up * alternate channels by matching devname versus a virtio_config entry. * We use a simple reference count mechanism to ensure that only a single * mount has a channel open at a time. * */ static int p9_virtio_create(struct p9_client *client, const char *devname, char *args) { struct virtio_chan *chan; int ret = -ENOENT; int found = 0; if (devname == NULL) return -EINVAL; mutex_lock(&virtio_9p_lock); list_for_each_entry(chan, &virtio_chan_list, chan_list) { if (!strcmp(devname, chan->tag)) { if (!chan->inuse) { chan->inuse = true; found = 1; break; } ret = -EBUSY; } } mutex_unlock(&virtio_9p_lock); if (!found) { pr_err("no channels available for device %s\n", devname); return ret; } client->trans = (void *)chan; client->status = Connected; chan->client = client; return 0; } /** * p9_virtio_remove - clean up resources associated with a virtio device * @vdev: virtio device to remove * */ static void p9_virtio_remove(struct virtio_device *vdev) { struct virtio_chan *chan = vdev->priv; unsigned long warning_time; mutex_lock(&virtio_9p_lock); /* Remove self from list so we don't get new users. */ list_del(&chan->chan_list); warning_time = jiffies; /* Wait for existing users to close. */ while (chan->inuse) { mutex_unlock(&virtio_9p_lock); msleep(250); if (time_after(jiffies, warning_time + 10 * HZ)) { dev_emerg(&vdev->dev, "p9_virtio_remove: waiting for device in use.\n"); warning_time = jiffies; } mutex_lock(&virtio_9p_lock); } mutex_unlock(&virtio_9p_lock); virtio_reset_device(vdev); vdev->config->del_vqs(vdev); sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); kfree(chan->tag); kfree(chan->vc_wq); kfree(chan); } static struct virtio_device_id id_table[] = { { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID }, { 0 }, }; static unsigned int features[] = { VIRTIO_9P_MOUNT_TAG, }; /* The standard "struct lguest_driver": */ static struct virtio_driver p9_virtio_drv = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .id_table = id_table, .probe = p9_virtio_probe, .remove = p9_virtio_remove, }; static struct p9_trans_module p9_virtio_trans = { .name = "virtio", .create = p9_virtio_create, .close = p9_virtio_close, .request = p9_virtio_request, .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accommodate, address * that are not at page boundary, that can result in an extra * page in zero copy. */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), .pooled_rbuffers = false, .def = 1, .owner = THIS_MODULE, }; /* The standard init function */ static int __init p9_virtio_init(void) { int rc; INIT_LIST_HEAD(&virtio_chan_list); v9fs_register_trans(&p9_virtio_trans); rc = register_virtio_driver(&p9_virtio_drv); if (rc) v9fs_unregister_trans(&p9_virtio_trans); return rc; } static void __exit p9_virtio_cleanup(void) { unregister_virtio_driver(&p9_virtio_drv); v9fs_unregister_trans(&p9_virtio_trans); } module_init(p9_virtio_init); module_exit(p9_virtio_cleanup); MODULE_ALIAS_9P("virtio"); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_DESCRIPTION("Virtio 9p Transport"); MODULE_LICENSE("GPL");
2 2 45 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_MAIN_H_ #define _NET_BATMAN_ADV_MAIN_H_ #define BATADV_DRIVER_AUTHOR "Marek Lindner <mareklindner@neomailbox.ch>, " \ "Simon Wunderlich <sw@simonwunderlich.de>" #define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced" #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION #define BATADV_SOURCE_VERSION "2024.2" #endif /* B.A.T.M.A.N. parameters */ #define BATADV_TQ_MAX_VALUE 255 #define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF #define BATADV_JITTER 20 /* Time To Live of broadcast messages */ #define BATADV_TTL 50 /* maximum sequence number age of broadcast messages */ #define BATADV_BCAST_MAX_AGE 64 /* purge originators after time in seconds if no valid packet comes in * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ #define BATADV_PURGE_TIMEOUT 200000 /* 200 seconds */ #define BATADV_TT_LOCAL_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_ROAM_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ #define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */ #define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) */ #define BATADV_TQ_LOCAL_WINDOW_SIZE 64 /* milliseconds we have to keep pending tt_req */ #define BATADV_TT_REQUEST_TIMEOUT 3000 #define BATADV_TQ_GLOBAL_WINDOW_SIZE 5 #define BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM 1 #define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1 /* B.A.T.M.A.N. V */ #define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */ #define BATADV_ELP_PROBES_PER_NODE 2 #define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */ #define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */ #define BATADV_ELP_MAX_AGE 64 #define BATADV_OGM_MAX_ORIGDIFF 5 #define BATADV_OGM_MAX_AGE 64 /* number of OGMs sent with the last tt diff */ #define BATADV_TT_OGM_APPEND_MAX 3 /* Time in which a client can roam at most ROAMING_MAX_COUNT times in * milliseconds */ #define BATADV_ROAMING_MAX_TIME 20000 #define BATADV_ROAMING_MAX_COUNT 5 #define BATADV_NO_FLAGS 0 #define BATADV_NULL_IFINDEX 0 /* dummy ifindex used to avoid iface checks */ #define BATADV_NO_MARK 0 /* default interface for multi interface operation. The default interface is * used for communication which originated locally (i.e. is not forwarded) * or where special forwarding is not desired/necessary. */ #define BATADV_IF_DEFAULT ((struct batadv_hard_iface *)NULL) #define BATADV_NUM_WORDS BITS_TO_LONGS(BATADV_TQ_LOCAL_WINDOW_SIZE) #define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */ /* number of packets to send for broadcasts on different interface types */ #define BATADV_NUM_BCASTS_DEFAULT 1 #define BATADV_NUM_BCASTS_WIRELESS 3 /* length of the single packet used by the TP meter */ #define BATADV_TP_PACKET_LEN ETH_DATA_LEN /* msecs after which an ARP_REQUEST is sent in broadcast as fallback */ #define ARP_REQ_DELAY 250 /* numbers of originator to contact for any PUT/GET DHT operation */ #define BATADV_DAT_CANDIDATES_NUM 3 /* BATADV_TQ_SIMILARITY_THRESHOLD - TQ points that a secondary metric can differ * at most from the primary one in order to be still considered acceptable */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 /* should not be bigger than 512 bytes or change the size of * forw_packet->direct_link_flags */ #define BATADV_MAX_AGGREGATION_BYTES 512 #define BATADV_MAX_AGGREGATION_MS 100 #define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */ #define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6) #define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10) #define BATADV_BLA_WAIT_PERIODS 3 #define BATADV_BLA_LOOPDETECT_PERIODS 6 #define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */ #define BATADV_DUPLIST_SIZE 16 #define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */ /* don't reset again within 30 seconds */ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 #define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ /** * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions */ #define BATADV_TP_MAX_NUM 5 /** * enum batadv_mesh_state - State of a soft interface */ enum batadv_mesh_state { /** @BATADV_MESH_INACTIVE: soft interface is not yet running */ BATADV_MESH_INACTIVE, /** @BATADV_MESH_ACTIVE: interface is up and running */ BATADV_MESH_ACTIVE, /** @BATADV_MESH_DEACTIVATING: interface is getting shut down */ BATADV_MESH_DEACTIVATING, }; #define BATADV_BCAST_QUEUE_LEN 256 #define BATADV_BATMAN_QUEUE_LEN 256 /** * enum batadv_uev_action - action type of uevent */ enum batadv_uev_action { /** @BATADV_UEV_ADD: gateway was selected (after none was selected) */ BATADV_UEV_ADD = 0, /** * @BATADV_UEV_DEL: selected gateway was removed and none is selected * anymore */ BATADV_UEV_DEL, /** * @BATADV_UEV_CHANGE: a different gateway was selected as based gateway */ BATADV_UEV_CHANGE, /** * @BATADV_UEV_LOOPDETECT: loop was detected which cannot be handled by * bridge loop avoidance */ BATADV_UEV_LOOPDETECT, }; /** * enum batadv_uev_type - Type of uevent */ enum batadv_uev_type { /** @BATADV_UEV_GW: selected gateway was modified */ BATADV_UEV_GW = 0, /** @BATADV_UEV_BLA: bridge loop avoidance event */ BATADV_UEV_BLA, }; #define BATADV_GW_THRESHOLD 50 /* Number of fragment chains for each orig_node */ #define BATADV_FRAG_BUFFER_COUNT 8 /* Maximum number of fragments for one packet */ #define BATADV_FRAG_MAX_FRAGMENTS 16 /* Maxumim size of each fragment */ #define BATADV_FRAG_MAX_FRAG_SIZE 1280 /* Time to keep fragments while waiting for rest of the fragments */ #define BATADV_FRAG_TIMEOUT 10000 #define BATADV_DAT_CANDIDATE_NOT_FOUND 0 #define BATADV_DAT_CANDIDATE_ORIG 1 /* Debug Messages */ #ifdef pr_fmt #undef pr_fmt #endif /* Append 'batman-adv: ' before kernel messages */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* Kernel headers */ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "types.h" #include "main.h" /** * batadv_print_vid() - return printable version of vid information * @vid: the VLAN identifier * * Return: -1 when no VLAN is used, VLAN id otherwise */ static inline int batadv_print_vid(unsigned short vid) { if (vid & BATADV_VLAN_HAS_TAG) return (int)(vid & VLAN_VID_MASK); else return -1; } extern struct list_head batadv_hardif_list; extern unsigned int batadv_hardif_generation; extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); int batadv_max_header_len(void); void batadv_skb_set_priority(struct sk_buff *skb, int offset); int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses * @data1: Pointer to a six-byte array containing the Ethernet address * @data2: Pointer other six-byte array containing the Ethernet address * * note: can't use ether_addr_equal() as it requires aligned memory * * Return: true if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { return ether_addr_equal_unaligned(data1, data2); } /** * batadv_has_timed_out() - compares current time (jiffies) and timestamp + * timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * * Return: true if current time is after timestamp + timeout */ static inline bool batadv_has_timed_out(unsigned long timestamp, unsigned int timeout) { return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout)); } /** * batadv_atomic_dec_not_zero() - Decrease unless the number is 0 * @v: pointer of type atomic_t * * Return: non-zero if v was not 0, and zero otherwise. */ #define batadv_atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) /** * batadv_smallest_signed_int() - Returns the smallest signed integer in two's * complement with the sizeof x * @x: type of integer * * Return: smallest signed integer of type */ #define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u))) /** * batadv_seq_before() - Checks if a sequence number x is a predecessor of y * @x: potential predecessor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: * * * when adding nothing - it is neither a predecessor nor a successor * * before adding more than 127 to the starting value - it is a predecessor, * * when adding 128 - it is neither a predecessor nor a successor, * * after adding more than 127 to the starting value - it is a successor * * Return: true when x is a predecessor of y, false otherwise */ #define batadv_seq_before(x, y) ({ \ typeof(x)_d1 = (x); \ typeof(y)_d2 = (y); \ typeof(x)_dummy = (_d1 - _d2); \ (void)(&_d1 == &_d2); \ _dummy > batadv_smallest_signed_int(_dummy); \ }) /** * batadv_seq_after() - Checks if a sequence number x is a successor of y * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: * * * when adding nothing - it is neither a predecessor nor a successor * * before adding more than 127 to the starting value - it is a predecessor, * * when adding 128 - it is neither a predecessor nor a successor, * * after adding more than 127 to the starting value - it is a successor * * Return: true when x is a successor of y, false otherwise */ #define batadv_seq_after(x, y) batadv_seq_before(y, x) /** * batadv_add_counter() - Add to per cpu statistics counter of soft interface * @bat_priv: the bat priv with all the soft interface information * @idx: counter index which should be modified * @count: value to increase counter by * * Stop preemption on local cpu while incrementing the counter */ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, size_t count) { this_cpu_add(bat_priv->bat_counters[idx], count); } /** * batadv_inc_counter() - Increase per cpu statistics counter of soft interface * @b: the bat priv with all the soft interface information * @i: counter index which should be modified */ #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) /** * BATADV_SKB_CB() - Get batadv_skb_cb from skb control buffer * @__skb: skb holding the control buffer * * The members of the control buffer are defined in struct batadv_skb_cb in * types.h. The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. * * Return: pointer to the batadv_skb_cb of the skb */ #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data); #endif /* _NET_BATMAN_ADV_MAIN_H_ */
161 167 166 159 158 160 159 4 159 158 159 66 66 65 160 160 160 159 159 164 84 164 160 160 164 166 167 46 46 46 166 66 162 4 164 167 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2019 Intel Corporation. */ #include <linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/static_call.h> /* The BPF dispatcher is a multiway branch code generator. The * dispatcher is a mechanism to avoid the performance penalty of an * indirect call, which is expensive when retpolines are enabled. A * dispatch client registers a BPF program into the dispatcher, and if * there is available room in the dispatcher a direct call to the BPF * program will be generated. All calls to the BPF programs called via * the dispatcher will then be a direct call, instead of an * indirect. The dispatcher hijacks a trampoline function it via the * __fentry__ of the trampoline. The trampoline function has the * following signature: * * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, * unsigned int (*bpf_func)(const void *, * const struct bpf_insn *)); */ static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( struct bpf_dispatcher *d, struct bpf_prog *prog) { int i; for (i = 0; i < BPF_DISPATCHER_MAX; i++) { if (prog == d->progs[i].prog) return &d->progs[i]; } return NULL; } static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( struct bpf_dispatcher *d) { return bpf_dispatcher_find_prog(d, NULL); } static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, struct bpf_prog *prog) { struct bpf_dispatcher_prog *entry; if (!prog) return false; entry = bpf_dispatcher_find_prog(d, prog); if (entry) { refcount_inc(&entry->users); return false; } entry = bpf_dispatcher_find_free(d); if (!entry) return false; bpf_prog_inc(prog); entry->prog = prog; refcount_set(&entry->users, 1); d->num_progs++; return true; } static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, struct bpf_prog *prog) { struct bpf_dispatcher_prog *entry; if (!prog) return false; entry = bpf_dispatcher_find_prog(d, prog); if (!entry) return false; if (refcount_dec_and_test(&entry->users)) { entry->prog = NULL; bpf_prog_put(prog); d->num_progs--; return true; } return false; } int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { return -ENOTSUPP; } static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; int i; for (i = 0; i < BPF_DISPATCHER_MAX; i++) { if (d->progs[i].prog) *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; } return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs); } static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { void *new, *tmp; u32 noff = 0; if (prev_num_progs) noff = d->image_off ^ (PAGE_SIZE / 2); new = d->num_progs ? d->image + noff : NULL; tmp = d->num_progs ? d->rw_image + noff : NULL; if (new) { /* Prepare the dispatcher in d->rw_image. Then use * bpf_arch_text_copy to update d->image, which is RO+X. */ if (bpf_dispatcher_prepare(d, new, tmp)) return; if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2))) return; } __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func); /* Make sure all the callers executing the previous/old half of the * image leave it, so following update call can modify it safely. */ synchronize_rcu(); if (new) d->image_off = noff; } void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) { bool changed = false; int prev_num_progs; if (from == to) return; mutex_lock(&d->mutex); if (!d->image) { d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero); if (!d->image) goto out; d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); if (!d->rw_image) { bpf_prog_pack_free(d->image, PAGE_SIZE); d->image = NULL; goto out; } bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); } prev_num_progs = d->num_progs; changed |= bpf_dispatcher_remove_prog(d, from); changed |= bpf_dispatcher_add_prog(d, to); if (!changed) goto out; bpf_dispatcher_update(d, prev_num_progs); out: mutex_unlock(&d->mutex); }
7 7 7 3 3 2 1 7 14 14 14 15 15 14 16 16 1 14 15 16 15 12 16 14 14 14 16 9 8 8 21 20 19 19 19 19 21 25 25 23 24 24 23 23 16 9 20 20 18 24 7 7 13 8 7 2 7 4 7 17 17 17 24 23 17 17 17 17 16 17 17 22 2 20 2 1 19 20 20 22 5 5 5 5 9 9 8 7 7 6 7 7 7 6 2 10 15 15 7 7 6 6 6 6 6 6 14 15 15 7 6 7 11 11 7 6 6 7 7 17 9 1 9 9 9 9 6 2 4 3 2 4 7 7 7 7 6 5 7 2 2 2 1 1 1 1 1 1 1 1 1 1 3 3 3 2 3 1 1 6 6 6 2 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. * * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The * PFNs can be placed into an iommu_domain, or returned to the caller as a page * list for access by an in-kernel user. * * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ #include <linux/iommufd.h> #include <linux/lockdep.h> #include <linux/iommu.h> #include <linux/sched/mm.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/errno.h> #include <uapi/linux/iommufd.h> #include "io_pagetable.h" #include "double_span.h" struct iopt_pages_list { struct iopt_pages *pages; struct iopt_area *area; struct list_head next; unsigned long start_byte; unsigned long length; }; struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, struct io_pagetable *iopt, unsigned long iova, unsigned long last_iova) { lockdep_assert_held(&iopt->iova_rwsem); iter->cur_iova = iova; iter->last_iova = last_iova; iter->area = iopt_area_iter_first(iopt, iova, iova); if (!iter->area) return NULL; if (!iter->area->pages) { iter->area = NULL; return NULL; } return iter->area; } struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) { unsigned long last_iova; if (!iter->area) return NULL; last_iova = iopt_area_last_iova(iter->area); if (iter->last_iova <= last_iova) return NULL; iter->cur_iova = last_iova + 1; iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, iter->last_iova); if (!iter->area) return NULL; if (iter->cur_iova != iopt_area_iova(iter->area) || !iter->area->pages) { iter->area = NULL; return NULL; } return iter->area; } static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, unsigned long length, unsigned long iova_alignment, unsigned long page_offset) { if (span->is_used || span->last_hole - span->start_hole < length - 1) return false; span->start_hole = ALIGN(span->start_hole, iova_alignment) | page_offset; if (span->start_hole > span->last_hole || span->last_hole - span->start_hole < length - 1) return false; return true; } static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, unsigned long length, unsigned long iova_alignment, unsigned long page_offset) { if (span->is_hole || span->last_used - span->start_used < length - 1) return false; span->start_used = ALIGN(span->start_used, iova_alignment) | page_offset; if (span->start_used > span->last_used || span->last_used - span->start_used < length - 1) return false; return true; } /* * Automatically find a block of IOVA that is not being used and not reserved. * Does not return a 0 IOVA even if it is valid. */ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, unsigned long uptr, unsigned long length) { unsigned long page_offset = uptr % PAGE_SIZE; struct interval_tree_double_span_iter used_span; struct interval_tree_span_iter allowed_span; unsigned long iova_alignment; lockdep_assert_held(&iopt->iova_rwsem); /* Protect roundup_pow-of_two() from overflow */ if (length == 0 || length >= ULONG_MAX / 2) return -EOVERFLOW; /* * Keep alignment present in the uptr when building the IOVA, this * increases the chance we can map a THP. */ if (!uptr) iova_alignment = roundup_pow_of_two(length); else iova_alignment = min_t(unsigned long, roundup_pow_of_two(length), 1UL << __ffs64(uptr)); if (iova_alignment < iopt->iova_alignment) return -EINVAL; interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { allowed_span.start_used = PAGE_SIZE; allowed_span.last_used = ULONG_MAX - PAGE_SIZE; allowed_span.is_hole = false; } if (!__alloc_iova_check_used(&allowed_span, length, iova_alignment, page_offset)) continue; interval_tree_for_each_double_span( &used_span, &iopt->reserved_itree, &iopt->area_itree, allowed_span.start_used, allowed_span.last_used) { if (!__alloc_iova_check_hole(&used_span, length, iova_alignment, page_offset)) continue; *iova = used_span.start_hole; return 0; } } return -ENOSPC; } static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length) { unsigned long last; lockdep_assert_held(&iopt->iova_rwsem); if ((iova & (iopt->iova_alignment - 1))) return -EINVAL; if (check_add_overflow(iova, length - 1, &last)) return -EOVERFLOW; /* No reserved IOVA intersects the range */ if (iopt_reserved_iter_first(iopt, iova, last)) return -EINVAL; /* Check that there is not already a mapping in the range */ if (iopt_area_iter_first(iopt, iova, last)) return -EEXIST; return 0; } /* * The area takes a slice of the pages from start_bytes to start_byte + length */ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, struct iopt_pages *pages, unsigned long iova, unsigned long start_byte, unsigned long length, int iommu_prot) { lockdep_assert_held_write(&iopt->iova_rwsem); if ((iommu_prot & IOMMU_WRITE) && !pages->writable) return -EPERM; area->iommu_prot = iommu_prot; area->page_offset = start_byte % PAGE_SIZE; if (area->page_offset & (iopt->iova_alignment - 1)) return -EINVAL; area->node.start = iova; if (check_add_overflow(iova, length - 1, &area->node.last)) return -EOVERFLOW; area->pages_node.start = start_byte / PAGE_SIZE; if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) return -EOVERFLOW; area->pages_node.last = area->pages_node.last / PAGE_SIZE; if (WARN_ON(area->pages_node.last >= pages->npages)) return -EOVERFLOW; /* * The area is inserted with a NULL pages indicating it is not fully * initialized yet. */ area->iopt = iopt; interval_tree_insert(&area->node, &iopt->area_itree); return 0; } static struct iopt_area *iopt_area_alloc(void) { struct iopt_area *area; area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); if (!area) return NULL; RB_CLEAR_NODE(&area->node.rb); RB_CLEAR_NODE(&area->pages_node.rb); return area; } static int iopt_alloc_area_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; unsigned long iova; int rc = 0; list_for_each_entry(elm, pages_list, next) { elm->area = iopt_area_alloc(); if (!elm->area) return -ENOMEM; } down_write(&iopt->iova_rwsem); if ((length & (iopt->iova_alignment - 1)) || !length) { rc = -EINVAL; goto out_unlock; } if (flags & IOPT_ALLOC_IOVA) { /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); rc = iopt_alloc_iova( iopt, dst_iova, (uintptr_t)elm->pages->uptr + elm->start_byte, length); if (rc) goto out_unlock; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { rc = -EINVAL; goto out_unlock; } } else { rc = iopt_check_iova(iopt, *dst_iova, length); if (rc) goto out_unlock; } /* * Areas are created with a NULL pages so that the IOVA space is * reserved and we can unlock the iova_rwsem. */ iova = *dst_iova; list_for_each_entry(elm, pages_list, next) { rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, elm->start_byte, elm->length, iommu_prot); if (rc) goto out_unlock; iova += elm->length; } out_unlock: up_write(&iopt->iova_rwsem); return rc; } static void iopt_abort_area(struct iopt_area *area) { if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(area->pages); if (area->iopt) { down_write(&area->iopt->iova_rwsem); interval_tree_remove(&area->node, &area->iopt->area_itree); up_write(&area->iopt->iova_rwsem); } kfree(area); } void iopt_free_pages_list(struct list_head *pages_list) { struct iopt_pages_list *elm; while ((elm = list_first_entry_or_null(pages_list, struct iopt_pages_list, next))) { if (elm->area) iopt_abort_area(elm->area); if (elm->pages) iopt_put_pages(elm->pages); list_del(&elm->next); kfree(elm); } } static int iopt_fill_domains_pages(struct list_head *pages_list) { struct iopt_pages_list *undo_elm; struct iopt_pages_list *elm; int rc; list_for_each_entry(elm, pages_list, next) { rc = iopt_area_fill_domains(elm->area, elm->pages); if (rc) goto err_undo; } return 0; err_undo: list_for_each_entry(undo_elm, pages_list, next) { if (undo_elm == elm) break; iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); } return rc; } int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; int rc; rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, iommu_prot, flags); if (rc) return rc; down_read(&iopt->domains_rwsem); rc = iopt_fill_domains_pages(pages_list); if (rc) goto out_unlock_domains; down_write(&iopt->iova_rwsem); list_for_each_entry(elm, pages_list, next) { /* * area->pages must be set inside the domains_rwsem to ensure * any newly added domains will get filled. Moves the reference * in from the list. */ elm->area->pages = elm->pages; elm->pages = NULL; elm->area = NULL; } up_write(&iopt->iova_rwsem); out_unlock_domains: up_read(&iopt->domains_rwsem); return rc; } /** * iopt_map_user_pages() - Map a user VA to an iova in the io page table * @ictx: iommufd_ctx the iopt is part of * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. Otherwise is the iova to map to on input * @uptr: User VA to map * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero * * iova, uptr, and length must be aligned to iova_alignment. For domain backed * page tables this will pin the pages and load them into the domain at iova. * For non-domain page tables this will only setup a lazy reference and the * caller must use iopt_access_pages() to touch them. * * iopt_unmap_iova() must be called to undo this before the io_pagetable can be * destroyed. */ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags) { struct iopt_pages_list elm = {}; LIST_HEAD(pages_list); int rc; elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); if (IS_ERR(elm.pages)) return PTR_ERR(elm.pages); if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; elm.start_byte = uptr - elm.pages->uptr; elm.length = length; list_add(&elm.next, &pages_list); rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); if (rc) { if (elm.area) iopt_abort_area(elm.area); if (elm.pages) iopt_put_pages(elm.pages); return rc; } return 0; } struct iova_bitmap_fn_arg { unsigned long flags; struct io_pagetable *iopt; struct iommu_domain *domain; struct iommu_dirty_bitmap *dirty; }; static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, unsigned long iova, size_t length, void *opaque) { struct iopt_area *area; struct iopt_area_contig_iter iter; struct iova_bitmap_fn_arg *arg = opaque; struct iommu_domain *domain = arg->domain; struct iommu_dirty_bitmap *dirty = arg->dirty; const struct iommu_dirty_ops *ops = domain->dirty_ops; unsigned long last_iova = iova + length - 1; unsigned long flags = arg->flags; int ret; iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { unsigned long last = min(last_iova, iopt_area_last_iova(area)); ret = ops->read_and_clear_dirty(domain, iter.cur_iova, last - iter.cur_iova + 1, flags, dirty); if (ret) return ret; } if (!iopt_area_contig_done(&iter)) return -EINVAL; return 0; } static int iommu_read_and_clear_dirty(struct iommu_domain *domain, struct io_pagetable *iopt, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap) { const struct iommu_dirty_ops *ops = domain->dirty_ops; struct iommu_iotlb_gather gather; struct iommu_dirty_bitmap dirty; struct iova_bitmap_fn_arg arg; struct iova_bitmap *iter; int ret = 0; if (!ops || !ops->read_and_clear_dirty) return -EOPNOTSUPP; iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, bitmap->page_size, u64_to_user_ptr(bitmap->data)); if (IS_ERR(iter)) return -ENOMEM; iommu_dirty_bitmap_init(&dirty, iter, &gather); arg.flags = flags; arg.iopt = iopt; arg.domain = domain; arg.dirty = &dirty; iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); if (!(flags & IOMMU_DIRTY_NO_CLEAR)) iommu_iotlb_sync(domain, &gather); iova_bitmap_free(iter); return ret; } int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommu_hwpt_get_dirty_bitmap *bitmap) { size_t iommu_pgsize = iopt->iova_alignment; u64 last_iova; if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) return -EOVERFLOW; if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) return -EOVERFLOW; if ((bitmap->iova & (iommu_pgsize - 1)) || ((last_iova + 1) & (iommu_pgsize - 1))) return -EINVAL; if (!bitmap->page_size) return -EINVAL; if ((bitmap->iova & (bitmap->page_size - 1)) || ((last_iova + 1) & (bitmap->page_size - 1))) return -EINVAL; return 0; } int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap) { int ret; ret = iommufd_check_iova_range(iopt, bitmap); if (ret) return ret; down_read(&iopt->iova_rwsem); ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); up_read(&iopt->iova_rwsem); return ret; } static int iopt_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain) { const struct iommu_dirty_ops *ops = domain->dirty_ops; struct iommu_iotlb_gather gather; struct iommu_dirty_bitmap dirty; struct iopt_area *area; int ret = 0; lockdep_assert_held_read(&iopt->iova_rwsem); iommu_dirty_bitmap_init(&dirty, NULL, &gather); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { if (!area->pages) continue; ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), iopt_area_length(area), 0, &dirty); if (ret) break; } iommu_iotlb_sync(domain, &gather); return ret; } int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable) { const struct iommu_dirty_ops *ops = domain->dirty_ops; int ret = 0; if (!ops) return -EOPNOTSUPP; down_read(&iopt->iova_rwsem); /* Clear dirty bits from PTEs to ensure a clean snapshot */ if (enable) { ret = iopt_clear_dirty_data(iopt, domain); if (ret) goto out_unlock; } ret = ops->set_dirty_tracking(domain, enable); out_unlock: up_read(&iopt->iova_rwsem); return ret; } int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { struct iopt_area_contig_iter iter; unsigned long last_iova; struct iopt_area *area; int rc; if (!length) return -EINVAL; if (check_add_overflow(iova, length - 1, &last_iova)) return -EOVERFLOW; down_read(&iopt->iova_rwsem); iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { struct iopt_pages_list *elm; unsigned long last = min(last_iova, iopt_area_last_iova(area)); elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); if (!elm) { rc = -ENOMEM; goto err_free; } elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); elm->pages = area->pages; elm->length = (last - iter.cur_iova) + 1; kref_get(&elm->pages->kref); list_add_tail(&elm->next, pages_list); } if (!iopt_area_contig_done(&iter)) { rc = -ENOENT; goto err_free; } up_read(&iopt->iova_rwsem); return 0; err_free: up_read(&iopt->iova_rwsem); iopt_free_pages_list(pages_list); return rc; } static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, unsigned long last, unsigned long *unmapped) { struct iopt_area *area; unsigned long unmapped_bytes = 0; unsigned int tries = 0; int rc = -ENOENT; /* * The domains_rwsem must be held in read mode any time any area->pages * is NULL. This prevents domain attach/detatch from running * concurrently with cleaning up the area. */ again: down_read(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); while ((area = iopt_area_iter_first(iopt, start, last))) { unsigned long area_last = iopt_area_last_iova(area); unsigned long area_first = iopt_area_iova(area); struct iopt_pages *pages; /* Userspace should not race map/unmap's of the same area */ if (!area->pages) { rc = -EBUSY; goto out_unlock_iova; } if (area_first < start || area_last > last) { rc = -ENOENT; goto out_unlock_iova; } if (area_first != start) tries = 0; /* * num_accesses writers must hold the iova_rwsem too, so we can * safely read it under the write side of the iovam_rwsem * without the pages->mutex. */ if (area->num_accesses) { size_t length = iopt_area_length(area); start = area_first; area->prevent_access = true; up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); iommufd_access_notify_unmap(iopt, area_first, length); /* Something is not responding to unmap requests. */ tries++; if (WARN_ON(tries > 100)) return -EDEADLOCK; goto again; } pages = area->pages; area->pages = NULL; up_write(&iopt->iova_rwsem); iopt_area_unfill_domains(area, pages); iopt_abort_area(area); iopt_put_pages(pages); unmapped_bytes += area_last - area_first + 1; down_write(&iopt->iova_rwsem); } if (unmapped_bytes) rc = 0; out_unlock_iova: up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); if (unmapped) *unmapped = unmapped_bytes; return rc; } /** * iopt_unmap_iova() - Remove a range of iova * @iopt: io_pagetable to act on * @iova: Starting iova to unmap * @length: Number of bytes to unmap * @unmapped: Return number of bytes unmapped * * The requested range must be a superset of existing ranges. * Splitting/truncating IOVA mappings is not allowed. */ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped) { unsigned long iova_last; if (!length) return -EINVAL; if (check_add_overflow(iova, length - 1, &iova_last)) return -EOVERFLOW; return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); } int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) { int rc; rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); /* If the IOVAs are empty then unmap all succeeds */ if (rc == -ENOENT) return 0; return rc; } /* The caller must always free all the nodes in the allowed_iova rb_root. */ int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova) { struct iopt_allowed *allowed; down_write(&iopt->iova_rwsem); swap(*allowed_iova, iopt->allowed_itree); for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { if (iopt_reserved_iter_first(iopt, allowed->node.start, allowed->node.last)) { swap(*allowed_iova, iopt->allowed_itree); up_write(&iopt->iova_rwsem); return -EADDRINUSE; } } up_write(&iopt->iova_rwsem); return 0; } int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, unsigned long last, void *owner) { struct iopt_reserved *reserved; lockdep_assert_held_write(&iopt->iova_rwsem); if (iopt_area_iter_first(iopt, start, last) || iopt_allowed_iter_first(iopt, start, last)) return -EADDRINUSE; reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); if (!reserved) return -ENOMEM; reserved->node.start = start; reserved->node.last = last; reserved->owner = owner; interval_tree_insert(&reserved->node, &iopt->reserved_itree); return 0; } static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) { struct iopt_reserved *reserved, *next; lockdep_assert_held_write(&iopt->iova_rwsem); for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; reserved = next) { next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); if (reserved->owner == owner) { interval_tree_remove(&reserved->node, &iopt->reserved_itree); kfree(reserved); } } } void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) { down_write(&iopt->iova_rwsem); __iopt_remove_reserved_iova(iopt, owner); up_write(&iopt->iova_rwsem); } void iopt_init_table(struct io_pagetable *iopt) { init_rwsem(&iopt->iova_rwsem); init_rwsem(&iopt->domains_rwsem); iopt->area_itree = RB_ROOT_CACHED; iopt->allowed_itree = RB_ROOT_CACHED; iopt->reserved_itree = RB_ROOT_CACHED; xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); /* * iopt's start as SW tables that can use the entire size_t IOVA space * due to the use of size_t in the APIs. They have no alignment * restriction. */ iopt->iova_alignment = 1; } void iopt_destroy_table(struct io_pagetable *iopt) { struct interval_tree_node *node; if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) iopt_remove_reserved_iova(iopt, NULL); while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, ULONG_MAX))) { interval_tree_remove(node, &iopt->allowed_itree); kfree(container_of(node, struct iopt_allowed, node)); } WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); WARN_ON(!xa_empty(&iopt->domains)); WARN_ON(!xa_empty(&iopt->access_list)); WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); } /** * iopt_unfill_domain() - Unfill a domain with PFNs * @iopt: io_pagetable to act on * @domain: domain to unfill * * This is used when removing a domain from the iopt. Every area in the iopt * will be unmapped from the domain. The domain must already be removed from the * domains xarray. */ static void iopt_unfill_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { struct iopt_area *area; lockdep_assert_held(&iopt->iova_rwsem); lockdep_assert_held_write(&iopt->domains_rwsem); /* * Some other domain is holding all the pfns still, rapidly unmap this * domain. */ if (iopt->next_domain_id != 0) { /* Pick an arbitrary remaining domain to act as storage */ struct iommu_domain *storage_domain = xa_load(&iopt->domains, 0); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (!pages) continue; mutex_lock(&pages->mutex); if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; mutex_unlock(&pages->mutex); iopt_area_unmap_domain(area, domain); } return; } for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (!pages) continue; mutex_lock(&pages->mutex); interval_tree_remove(&area->pages_node, &pages->domains_itree); WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); mutex_unlock(&pages->mutex); } } /** * iopt_fill_domain() - Fill a domain with PFNs * @iopt: io_pagetable to act on * @domain: domain to fill * * Fill the domain with PFNs from every area in the iopt. On failure the domain * is left unchanged. */ static int iopt_fill_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { struct iopt_area *end_area; struct iopt_area *area; int rc; lockdep_assert_held(&iopt->iova_rwsem); lockdep_assert_held_write(&iopt->domains_rwsem); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (!pages) continue; mutex_lock(&pages->mutex); rc = iopt_area_fill_domain(area, domain); if (rc) { mutex_unlock(&pages->mutex); goto out_unfill; } if (!area->storage_domain) { WARN_ON(iopt->next_domain_id != 0); area->storage_domain = domain; interval_tree_insert(&area->pages_node, &pages->domains_itree); } mutex_unlock(&pages->mutex); } return 0; out_unfill: end_area = area; for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { struct iopt_pages *pages = area->pages; if (area == end_area) break; if (!pages) continue; mutex_lock(&pages->mutex); if (iopt->next_domain_id == 0) { interval_tree_remove(&area->pages_node, &pages->domains_itree); area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); mutex_unlock(&pages->mutex); } return rc; } /* All existing area's conform to an increased page size */ static int iopt_check_iova_alignment(struct io_pagetable *iopt, unsigned long new_iova_alignment) { unsigned long align_mask = new_iova_alignment - 1; struct iopt_area *area; lockdep_assert_held(&iopt->iova_rwsem); lockdep_assert_held(&iopt->domains_rwsem); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) if ((iopt_area_iova(area) & align_mask) || (iopt_area_length(area) & align_mask) || (area->page_offset & align_mask)) return -EADDRINUSE; if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { struct iommufd_access *access; unsigned long index; xa_for_each(&iopt->access_list, index, access) if (WARN_ON(access->iova_alignment > new_iova_alignment)) return -EADDRINUSE; } return 0; } int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { const struct iommu_domain_geometry *geometry = &domain->geometry; struct iommu_domain *iter_domain; unsigned int new_iova_alignment; unsigned long index; int rc; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); xa_for_each(&iopt->domains, index, iter_domain) { if (WARN_ON(iter_domain == domain)) { rc = -EEXIST; goto out_unlock; } } /* * The io page size drives the iova_alignment. Internally the iopt_pages * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE * objects into the iommu_domain. * * A iommu_domain must always be able to accept PAGE_SIZE to be * compatible as we can't guarantee higher contiguity. */ new_iova_alignment = max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap), iopt->iova_alignment); if (new_iova_alignment > PAGE_SIZE) { rc = -EINVAL; goto out_unlock; } if (new_iova_alignment != iopt->iova_alignment) { rc = iopt_check_iova_alignment(iopt, new_iova_alignment); if (rc) goto out_unlock; } /* No area exists that is outside the allowed domain aperture */ if (geometry->aperture_start != 0) { rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, domain); if (rc) goto out_reserved; } if (geometry->aperture_end != ULONG_MAX) { rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, ULONG_MAX, domain); if (rc) goto out_reserved; } rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); if (rc) goto out_reserved; rc = iopt_fill_domain(iopt, domain); if (rc) goto out_release; iopt->iova_alignment = new_iova_alignment; xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); iopt->next_domain_id++; up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return 0; out_release: xa_release(&iopt->domains, iopt->next_domain_id); out_reserved: __iopt_remove_reserved_iova(iopt, domain); out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return rc; } static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) { unsigned long new_iova_alignment; struct iommufd_access *access; struct iommu_domain *domain; unsigned long index; lockdep_assert_held_write(&iopt->iova_rwsem); lockdep_assert_held(&iopt->domains_rwsem); /* See batch_iommu_map_small() */ if (iopt->disable_large_pages) new_iova_alignment = PAGE_SIZE; else new_iova_alignment = 1; xa_for_each(&iopt->domains, index, domain) new_iova_alignment = max_t(unsigned long, 1UL << __ffs(domain->pgsize_bitmap), new_iova_alignment); xa_for_each(&iopt->access_list, index, access) new_iova_alignment = max_t(unsigned long, access->iova_alignment, new_iova_alignment); if (new_iova_alignment > iopt->iova_alignment) { int rc; rc = iopt_check_iova_alignment(iopt, new_iova_alignment); if (rc) return rc; } iopt->iova_alignment = new_iova_alignment; return 0; } void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain) { struct iommu_domain *iter_domain = NULL; unsigned long index; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); xa_for_each(&iopt->domains, index, iter_domain) if (iter_domain == domain) break; if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) goto out_unlock; /* * Compress the xarray to keep it linear by swapping the entry to erase * with the tail entry and shrinking the tail. */ iopt->next_domain_id--; iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); if (index != iopt->next_domain_id) xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); iopt_unfill_domain(iopt, domain); __iopt_remove_reserved_iova(iopt, domain); WARN_ON(iopt_calculate_iova_alignment(iopt)); out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); } /** * iopt_area_split - Split an area into two parts at iova * @area: The area to split * @iova: Becomes the last of a new area * * This splits an area into two. It is part of the VFIO compatibility to allow * poking a hole in the mapping. The two areas continue to point at the same * iopt_pages, just with different starting bytes. */ static int iopt_area_split(struct iopt_area *area, unsigned long iova) { unsigned long alignment = area->iopt->iova_alignment; unsigned long last_iova = iopt_area_last_iova(area); unsigned long start_iova = iopt_area_iova(area); unsigned long new_start = iova + 1; struct io_pagetable *iopt = area->iopt; struct iopt_pages *pages = area->pages; struct iopt_area *lhs; struct iopt_area *rhs; int rc; lockdep_assert_held_write(&iopt->iova_rwsem); if (iova == start_iova || iova == last_iova) return 0; if (!pages || area->prevent_access) return -EBUSY; if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; lhs = iopt_area_alloc(); if (!lhs) return -ENOMEM; rhs = iopt_area_alloc(); if (!rhs) { rc = -ENOMEM; goto err_free_lhs; } mutex_lock(&pages->mutex); /* * Splitting is not permitted if an access exists, we don't track enough * information to split existing accesses. */ if (area->num_accesses) { rc = -EINVAL; goto err_unlock; } /* * Splitting is not permitted if a domain could have been mapped with * huge pages. */ if (area->storage_domain && !iopt->disable_large_pages) { rc = -EINVAL; goto err_unlock; } interval_tree_remove(&area->node, &iopt->area_itree); rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, iopt_area_start_byte(area, start_iova), (new_start - 1) - start_iova + 1, area->iommu_prot); if (WARN_ON(rc)) goto err_insert; rc = iopt_insert_area(iopt, rhs, area->pages, new_start, iopt_area_start_byte(area, new_start), last_iova - new_start + 1, area->iommu_prot); if (WARN_ON(rc)) goto err_remove_lhs; /* * If the original area has filled a domain, domains_itree has to be * updated. */ if (area->storage_domain) { interval_tree_remove(&area->pages_node, &pages->domains_itree); interval_tree_insert(&lhs->pages_node, &pages->domains_itree); interval_tree_insert(&rhs->pages_node, &pages->domains_itree); } lhs->storage_domain = area->storage_domain; lhs->pages = area->pages; rhs->storage_domain = area->storage_domain; rhs->pages = area->pages; kref_get(&rhs->pages->kref); kfree(area); mutex_unlock(&pages->mutex); /* * No change to domains or accesses because the pages hasn't been * changed */ return 0; err_remove_lhs: interval_tree_remove(&lhs->node, &iopt->area_itree); err_insert: interval_tree_insert(&area->node, &iopt->area_itree); err_unlock: mutex_unlock(&pages->mutex); kfree(rhs); err_free_lhs: kfree(lhs); return rc; } int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, size_t num_iovas) { int rc = 0; int i; down_write(&iopt->iova_rwsem); for (i = 0; i < num_iovas; i++) { struct iopt_area *area; area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); if (!area) continue; rc = iopt_area_split(area, iovas[i]); if (rc) break; } up_write(&iopt->iova_rwsem); return rc; } void iopt_enable_large_pages(struct io_pagetable *iopt) { int rc; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); WRITE_ONCE(iopt->disable_large_pages, false); rc = iopt_calculate_iova_alignment(iopt); WARN_ON(rc); up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); } int iopt_disable_large_pages(struct io_pagetable *iopt) { int rc = 0; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); if (iopt->disable_large_pages) goto out_unlock; /* Won't do it if domains already have pages mapped in them */ if (!xa_empty(&iopt->domains) && !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { rc = -EINVAL; goto out_unlock; } WRITE_ONCE(iopt->disable_large_pages, true); rc = iopt_calculate_iova_alignment(iopt); if (rc) WRITE_ONCE(iopt->disable_large_pages, false); out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return rc; } int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) { u32 new_id; int rc; down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b, GFP_KERNEL_ACCOUNT); if (rc) goto out_unlock; rc = iopt_calculate_iova_alignment(iopt); if (rc) { xa_erase(&iopt->access_list, new_id); goto out_unlock; } access->iopt_access_list_id = new_id; out_unlock: up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); return rc; } void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access, u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access); WARN_ON(iopt_calculate_iova_alignment(iopt)); up_write(&iopt->iova_rwsem); up_write(&iopt->domains_rwsem); } /* Narrow the valid_iova_itree to include reserved ranges from a device. */ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, struct device *dev, phys_addr_t *sw_msi_start) { struct iommu_resv_region *resv; LIST_HEAD(resv_regions); unsigned int num_hw_msi = 0; unsigned int num_sw_msi = 0; int rc; if (iommufd_should_fail()) return -EINVAL; down_write(&iopt->iova_rwsem); /* FIXME: drivers allocate memory but there is no failure propogated */ iommu_get_resv_regions(dev, &resv_regions); list_for_each_entry(resv, &resv_regions, list) { if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) continue; if (sw_msi_start && resv->type == IOMMU_RESV_MSI) num_hw_msi++; if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { *sw_msi_start = resv->start; num_sw_msi++; } rc = iopt_reserve_iova(iopt, resv->start, resv->length - 1 + resv->start, dev); if (rc) goto out_reserved; } /* Drivers must offer sane combinations of regions */ if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { rc = -EINVAL; goto out_reserved; } rc = 0; goto out_free_resv; out_reserved: __iopt_remove_reserved_iova(iopt, dev); out_free_resv: iommu_put_resv_regions(dev, &resv_regions); up_write(&iopt->iova_rwsem); return rc; }
4 4 1484 1484 1484 24 19 19 13 1487 14 2 1 2 2 1 1 2 2 2 10 2 2 2 4 4 4 4 58 58 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Event handling for HSR and PRP devices. */ #include <linux/netdevice.h> #include <net/rtnetlink.h> #include <linux/rculist.h> #include <linux/timer.h> #include <linux/etherdevice.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_netlink.h" #include "hsr_framereg.h" #include "hsr_slave.h" static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; hsr_for_each_port(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; } static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { struct hsr_port *port, *master; struct net_device *dev; struct hsr_priv *hsr; LIST_HEAD(list_kill); int mtu_max; int res; dev = netdev_notifier_info_to_dev(ptr); port = hsr_port_get_rtnl(dev); if (!port) { if (!is_hsr_master(dev)) return NOTIFY_DONE; /* Not an HSR device */ hsr = netdev_priv(dev); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (!port) { /* Resend of notification concerning removed device? */ return NOTIFY_DONE; } } else { hsr = port->hsr; } switch (event) { case NETDEV_UP: /* Administrative state DOWN */ case NETDEV_DOWN: /* Administrative state UP */ case NETDEV_CHANGE: /* Link (carrier) state changes */ hsr_check_carrier_and_operstate(hsr); break; case NETDEV_CHANGENAME: if (is_hsr_master(dev)) hsr_debugfs_rename(dev); break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) { /* This should not happen since there's no * ndo_set_mac_address() for HSR devices - i.e. not * supported. */ break; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port->type == HSR_PT_SLAVE_A) { eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); } /* Make sure we recognize frames from ourselves in hsr_rcv() */ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); res = hsr_create_self_node(hsr, master->dev->dev_addr, port ? port->dev->dev_addr : master->dev->dev_addr); if (res) netdev_warn(master->dev, "Could not update HSR node address.\n"); break; case NETDEV_CHANGEMTU: if (port->type == HSR_PT_MASTER) break; /* Handled in ndo_change_mtu() */ mtu_max = hsr_get_max_mtu(port->hsr); master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); WRITE_ONCE(master->dev->mtu, mtu_max); break; case NETDEV_UNREGISTER: if (!is_hsr_master(dev)) { master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); if (hsr_slave_empty(master->hsr)) { const struct rtnl_link_ops *ops; ops = master->dev->rtnl_link_ops; ops->dellink(master->dev, &list_kill); unregister_netdevice_many(&list_kill); } } break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change * its type. */ return NOTIFY_BAD; } return NOTIFY_DONE; } struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; hsr_for_each_port(hsr, port) if (port->type == pt) return port; return NULL; } int hsr_get_version(struct net_device *dev, enum hsr_version *ver) { struct hsr_priv *hsr; hsr = netdev_priv(dev); *ver = hsr->prot_version; return 0; } EXPORT_SYMBOL(hsr_get_version); static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; static int __init hsr_init(void) { int err; BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN); err = register_netdevice_notifier(&hsr_nb); if (err) return err; err = hsr_netlink_init(); if (err) { unregister_netdevice_notifier(&hsr_nb); return err; } return 0; } static void __exit hsr_exit(void) { hsr_netlink_exit(); hsr_debugfs_remove_root(); unregister_netdevice_notifier(&hsr_nb); } module_init(hsr_init); module_exit(hsr_exit); MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */
12 12 12 12 12 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 /* * linux/fs/nls/mac-gaelic.c * * Charset macgaelic translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* 0xb0 */ 0x1e02, 0x00b1, 0x2264, 0x2265, 0x1e03, 0x010a, 0x010b, 0x1e0a, 0x1e0b, 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x00e6, 0x00f8, /* 0xc0 */ 0x1e41, 0x1e56, 0x1e57, 0x027c, 0x0192, 0x017f, 0x1e60, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x1e61, 0x1e9b, 0x00ff, 0x0178, 0x1e6a, 0x20ac, 0x2039, 0x203a, 0x0176, 0x0177, /* 0xe0 */ 0x1e6b, 0x00b7, 0x1ef2, 0x1ef3, 0x204a, 0x00c2, 0x00ca, 0x00c1, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xf0 */ 0x2663, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x00dd, 0x00fd, 0x0174, 0x0175, 0x1e84, 0x1e85, 0x1e80, 0x1e81, 0x1e82, 0x1e83, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0x00, 0xa2, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */ 0xac, 0xa9, 0x00, 0xc7, 0x00, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0xab, 0x00, 0xa6, 0xe1, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0xf6, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0x00, /* 0xf0-0xf7 */ 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0xf7, 0x00, 0xd8, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0xb5, 0xb6, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0xbb, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xde, 0xdf, /* 0x70-0x77 */ 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page1e[256] = { 0x00, 0x00, 0xb0, 0xb4, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0xb7, 0xb8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0xba, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0xbd, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc1, 0xc2, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0xda, 0xe0, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xfc, 0xfd, 0xfe, 0xff, 0xfa, 0xfb, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0xe2, 0xe3, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ 0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page26[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page1e, NULL, page20, page21, page22, NULL, NULL, NULL, page26, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x00-0x07 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x08-0x0f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x10-0x17 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x18-0x1f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x20-0x27 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x28-0x2f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x30-0x37 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x38-0x3f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x40-0x47 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x48-0x4f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x50-0x57 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x58-0x5f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x60-0x67 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x68-0x6f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x70-0x77 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x78-0x7f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x80-0x87 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x88-0x8f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x90-0x97 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0x98-0x9f */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa0-0xa7 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xa8-0xaf */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb0-0xb7 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xb8-0xbf */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc0-0xc7 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xc8-0xcf */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd0-0xd7 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xd8-0xdf */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe0-0xe7 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xe8-0xef */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0-0xf7 */ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macgaelic", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macgaelic(void) { return register_nls(&table); } static void __exit exit_nls_macgaelic(void) { unregister_nls(&table); } module_init(init_nls_macgaelic) module_exit(exit_nls_macgaelic) MODULE_DESCRIPTION("NLS Codepage macgaelic"); MODULE_LICENSE("Dual BSD/GPL");
70 71 71 64 64 64 64 71 71 71 71 3 3 3 3 3 3 3 3 1 2 3 4 4 1 4 4 4 4 3 12 12 12 12 10 1 10 4 3 2 3 1 9 5 3 7 7 7 7 12 12 34 9 6 33 24 4 21 26 16 15 14 13 15 10 34 3 1 22 62 6 11 5 4 1 1 2 1 3 13 13 12 12 35 62 34 1 1 2 1 11 11 11 11 1 71 63 64 64 64 64 63 71 71 70 78 64 71 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 // SPDX-License-Identifier: GPL-2.0-or-later /* i2c-dev.c - i2c-bus driver, char device interface Copyright (C) 1995-97 Simon G. Vogl Copyright (C) 1998-99 Frodo Looijaard <frodol@dds.nl> Copyright (C) 2003 Greg Kroah-Hartman <greg@kroah.com> */ /* Note that this is a complete rewrite of Simon Vogl's i2c-dev module. But I have used so much of his original code and ideas that it seems only fair to recognize him as co-author -- Frodo */ /* The I2C_RDWR ioctl code is written by Kolja Waschk <waschk@telos.de> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cdev.h> #include <linux/compat.h> #include <linux/device.h> #include <linux/fs.h> #include <linux/i2c-dev.h> #include <linux/i2c.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * An i2c_dev represents an i2c_adapter ... an I2C or SMBus master, not a * slave (i2c_client) with which messages will be exchanged. It's coupled * with a character special file which is accessed by user mode drivers. * * The list of i2c_dev structures is parallel to the i2c_adapter lists * maintained by the driver model, and is updated using bus notifications. */ struct i2c_dev { struct list_head list; struct i2c_adapter *adap; struct device dev; struct cdev cdev; }; #define I2C_MINORS (MINORMASK + 1) static LIST_HEAD(i2c_dev_list); static DEFINE_SPINLOCK(i2c_dev_list_lock); static struct i2c_dev *i2c_dev_get_by_minor(unsigned index) { struct i2c_dev *i2c_dev; spin_lock(&i2c_dev_list_lock); list_for_each_entry(i2c_dev, &i2c_dev_list, list) { if (i2c_dev->adap->nr == index) goto found; } i2c_dev = NULL; found: spin_unlock(&i2c_dev_list_lock); return i2c_dev; } static struct i2c_dev *get_free_i2c_dev(struct i2c_adapter *adap) { struct i2c_dev *i2c_dev; if (adap->nr >= I2C_MINORS) { pr_err("Out of device minors (%d)\n", adap->nr); return ERR_PTR(-ENODEV); } i2c_dev = kzalloc(sizeof(*i2c_dev), GFP_KERNEL); if (!i2c_dev) return ERR_PTR(-ENOMEM); i2c_dev->adap = adap; spin_lock(&i2c_dev_list_lock); list_add_tail(&i2c_dev->list, &i2c_dev_list); spin_unlock(&i2c_dev_list_lock); return i2c_dev; } static void put_i2c_dev(struct i2c_dev *i2c_dev, bool del_cdev) { spin_lock(&i2c_dev_list_lock); list_del(&i2c_dev->list); spin_unlock(&i2c_dev_list_lock); if (del_cdev) cdev_device_del(&i2c_dev->cdev, &i2c_dev->dev); put_device(&i2c_dev->dev); } static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_dev *i2c_dev = i2c_dev_get_by_minor(MINOR(dev->devt)); if (!i2c_dev) return -ENODEV; return sysfs_emit(buf, "%s\n", i2c_dev->adap->name); } static DEVICE_ATTR_RO(name); static struct attribute *i2c_attrs[] = { &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(i2c); /* ------------------------------------------------------------------------- */ /* * After opening an instance of this character special file, a file * descriptor starts out associated only with an i2c_adapter (and bus). * * Using the I2C_RDWR ioctl(), you can then *immediately* issue i2c_msg * traffic to any devices on the bus used by that adapter. That's because * the i2c_msg vectors embed all the addressing information they need, and * are submitted directly to an i2c_adapter. However, SMBus-only adapters * don't support that interface. * * To use read()/write() system calls on that file descriptor, or to use * SMBus interfaces (and work with SMBus-only hosts!), you must first issue * an I2C_SLAVE (or I2C_SLAVE_FORCE) ioctl. That configures an anonymous * (never registered) i2c_client so it holds the addressing information * needed by those system calls and by this SMBus interface. */ static ssize_t i2cdev_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { char *tmp; int ret; struct i2c_client *client = file->private_data; /* Adapter must support I2C transfers */ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) return -EOPNOTSUPP; if (count > 8192) count = 8192; tmp = kzalloc(count, GFP_KERNEL); if (tmp == NULL) return -ENOMEM; pr_debug("i2c-%d reading %zu bytes.\n", iminor(file_inode(file)), count); ret = i2c_master_recv(client, tmp, count); if (ret >= 0) if (copy_to_user(buf, tmp, ret)) ret = -EFAULT; kfree(tmp); return ret; } static ssize_t i2cdev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { int ret; char *tmp; struct i2c_client *client = file->private_data; /* Adapter must support I2C transfers */ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) return -EOPNOTSUPP; if (count > 8192) count = 8192; tmp = memdup_user(buf, count); if (IS_ERR(tmp)) return PTR_ERR(tmp); pr_debug("i2c-%d writing %zu bytes.\n", iminor(file_inode(file)), count); ret = i2c_master_send(client, tmp, count); kfree(tmp); return ret; } static int i2cdev_check(struct device *dev, void *addrp) { struct i2c_client *client = i2c_verify_client(dev); if (!client || client->addr != *(unsigned int *)addrp) return 0; return dev->driver ? -EBUSY : 0; } /* walk up mux tree */ static int i2cdev_check_mux_parents(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result; result = device_for_each_child(&adapter->dev, &addr, i2cdev_check); if (!result && parent) result = i2cdev_check_mux_parents(parent, addr); return result; } /* recurse down mux tree */ static int i2cdev_check_mux_children(struct device *dev, void *addrp) { int result; if (dev->type == &i2c_adapter_type) result = device_for_each_child(dev, addrp, i2cdev_check_mux_children); else result = i2cdev_check(dev, addrp); return result; } /* This address checking function differs from the one in i2c-core in that it considers an address with a registered device, but no driver bound to it, as NOT busy. */ static int i2cdev_check_addr(struct i2c_adapter *adapter, unsigned int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result = 0; if (parent) result = i2cdev_check_mux_parents(parent, addr); if (!result) result = device_for_each_child(&adapter->dev, &addr, i2cdev_check_mux_children); return result; } static noinline int i2cdev_ioctl_rdwr(struct i2c_client *client, unsigned nmsgs, struct i2c_msg *msgs) { u8 __user **data_ptrs; int i, res; /* Adapter must support I2C transfers */ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) return -EOPNOTSUPP; data_ptrs = kmalloc_array(nmsgs, sizeof(u8 __user *), GFP_KERNEL); if (data_ptrs == NULL) { kfree(msgs); return -ENOMEM; } res = 0; for (i = 0; i < nmsgs; i++) { /* Limit the size of the message to a sane amount */ if (msgs[i].len > 8192) { res = -EINVAL; break; } data_ptrs[i] = (u8 __user *)msgs[i].buf; msgs[i].buf = memdup_user(data_ptrs[i], msgs[i].len); if (IS_ERR(msgs[i].buf)) { res = PTR_ERR(msgs[i].buf); break; } /* memdup_user allocates with GFP_KERNEL, so DMA is ok */ msgs[i].flags |= I2C_M_DMA_SAFE; /* * If the message length is received from the slave (similar * to SMBus block read), we must ensure that the buffer will * be large enough to cope with a message length of * I2C_SMBUS_BLOCK_MAX as this is the maximum underlying bus * drivers allow. The first byte in the buffer must be * pre-filled with the number of extra bytes, which must be * at least one to hold the message length, but can be * greater (for example to account for a checksum byte at * the end of the message.) */ if (msgs[i].flags & I2C_M_RECV_LEN) { if (!(msgs[i].flags & I2C_M_RD) || msgs[i].len < 1 || msgs[i].buf[0] < 1 || msgs[i].len < msgs[i].buf[0] + I2C_SMBUS_BLOCK_MAX) { i++; res = -EINVAL; break; } msgs[i].len = msgs[i].buf[0]; } } if (res < 0) { int j; for (j = 0; j < i; ++j) kfree(msgs[j].buf); kfree(data_ptrs); kfree(msgs); return res; } res = i2c_transfer(client->adapter, msgs, nmsgs); while (i-- > 0) { if (res >= 0 && (msgs[i].flags & I2C_M_RD)) { if (copy_to_user(data_ptrs[i], msgs[i].buf, msgs[i].len)) res = -EFAULT; } kfree(msgs[i].buf); } kfree(data_ptrs); kfree(msgs); return res; } static noinline int i2cdev_ioctl_smbus(struct i2c_client *client, u8 read_write, u8 command, u32 size, union i2c_smbus_data __user *data) { union i2c_smbus_data temp = {}; int datasize, res; if ((size != I2C_SMBUS_BYTE) && (size != I2C_SMBUS_QUICK) && (size != I2C_SMBUS_BYTE_DATA) && (size != I2C_SMBUS_WORD_DATA) && (size != I2C_SMBUS_PROC_CALL) && (size != I2C_SMBUS_BLOCK_DATA) && (size != I2C_SMBUS_I2C_BLOCK_BROKEN) && (size != I2C_SMBUS_I2C_BLOCK_DATA) && (size != I2C_SMBUS_BLOCK_PROC_CALL)) { dev_dbg(&client->adapter->dev, "size out of range (%x) in ioctl I2C_SMBUS.\n", size); return -EINVAL; } /* Note that I2C_SMBUS_READ and I2C_SMBUS_WRITE are 0 and 1, so the check is valid if size==I2C_SMBUS_QUICK too. */ if ((read_write != I2C_SMBUS_READ) && (read_write != I2C_SMBUS_WRITE)) { dev_dbg(&client->adapter->dev, "read_write out of range (%x) in ioctl I2C_SMBUS.\n", read_write); return -EINVAL; } /* Note that command values are always valid! */ if ((size == I2C_SMBUS_QUICK) || ((size == I2C_SMBUS_BYTE) && (read_write == I2C_SMBUS_WRITE))) /* These are special: we do not use data */ return i2c_smbus_xfer(client->adapter, client->addr, client->flags, read_write, command, size, NULL); if (data == NULL) { dev_dbg(&client->adapter->dev, "data is NULL pointer in ioctl I2C_SMBUS.\n"); return -EINVAL; } if ((size == I2C_SMBUS_BYTE_DATA) || (size == I2C_SMBUS_BYTE)) datasize = sizeof(data->byte); else if ((size == I2C_SMBUS_WORD_DATA) || (size == I2C_SMBUS_PROC_CALL)) datasize = sizeof(data->word); else /* size == smbus block, i2c block, or block proc. call */ datasize = sizeof(data->block); if ((size == I2C_SMBUS_PROC_CALL) || (size == I2C_SMBUS_BLOCK_PROC_CALL) || (size == I2C_SMBUS_I2C_BLOCK_DATA) || (read_write == I2C_SMBUS_WRITE)) { if (copy_from_user(&temp, data, datasize)) return -EFAULT; } if (size == I2C_SMBUS_I2C_BLOCK_BROKEN) { /* Convert old I2C block commands to the new convention. This preserves binary compatibility. */ size = I2C_SMBUS_I2C_BLOCK_DATA; if (read_write == I2C_SMBUS_READ) temp.block[0] = I2C_SMBUS_BLOCK_MAX; } res = i2c_smbus_xfer(client->adapter, client->addr, client->flags, read_write, command, size, &temp); if (!res && ((size == I2C_SMBUS_PROC_CALL) || (size == I2C_SMBUS_BLOCK_PROC_CALL) || (read_write == I2C_SMBUS_READ))) { if (copy_to_user(data, &temp, datasize)) return -EFAULT; } return res; } static long i2cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct i2c_client *client = file->private_data; unsigned long funcs; dev_dbg(&client->adapter->dev, "ioctl, cmd=0x%02x, arg=0x%02lx\n", cmd, arg); switch (cmd) { case I2C_SLAVE: case I2C_SLAVE_FORCE: if ((arg > 0x3ff) || (((client->flags & I2C_M_TEN) == 0) && arg > 0x7f)) return -EINVAL; if (cmd == I2C_SLAVE && i2cdev_check_addr(client->adapter, arg)) return -EBUSY; /* REVISIT: address could become busy later */ client->addr = arg; return 0; case I2C_TENBIT: if (arg) client->flags |= I2C_M_TEN; else client->flags &= ~I2C_M_TEN; return 0; case I2C_PEC: /* * Setting the PEC flag here won't affect kernel drivers, * which will be using the i2c_client node registered with * the driver model core. Likewise, when that client has * the PEC flag already set, the i2c-dev driver won't see * (or use) this setting. */ if (arg) client->flags |= I2C_CLIENT_PEC; else client->flags &= ~I2C_CLIENT_PEC; return 0; case I2C_FUNCS: funcs = i2c_get_functionality(client->adapter); return put_user(funcs, (unsigned long __user *)arg); case I2C_RDWR: { struct i2c_rdwr_ioctl_data rdwr_arg; struct i2c_msg *rdwr_pa; if (copy_from_user(&rdwr_arg, (struct i2c_rdwr_ioctl_data __user *)arg, sizeof(rdwr_arg))) return -EFAULT; if (!rdwr_arg.msgs || rdwr_arg.nmsgs == 0) return -EINVAL; /* * Put an arbitrary limit on the number of messages that can * be sent at once */ if (rdwr_arg.nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) return -EINVAL; rdwr_pa = memdup_array_user(rdwr_arg.msgs, rdwr_arg.nmsgs, sizeof(struct i2c_msg)); if (IS_ERR(rdwr_pa)) return PTR_ERR(rdwr_pa); return i2cdev_ioctl_rdwr(client, rdwr_arg.nmsgs, rdwr_pa); } case I2C_SMBUS: { struct i2c_smbus_ioctl_data data_arg; if (copy_from_user(&data_arg, (struct i2c_smbus_ioctl_data __user *) arg, sizeof(struct i2c_smbus_ioctl_data))) return -EFAULT; return i2cdev_ioctl_smbus(client, data_arg.read_write, data_arg.command, data_arg.size, data_arg.data); } case I2C_RETRIES: if (arg > INT_MAX) return -EINVAL; client->adapter->retries = arg; break; case I2C_TIMEOUT: if (arg > INT_MAX) return -EINVAL; /* For historical reasons, user-space sets the timeout * value in units of 10 ms. */ client->adapter->timeout = msecs_to_jiffies(arg * 10); break; default: /* NOTE: returning a fault code here could cause trouble * in buggy userspace code. Some old kernel bugs returned * zero in this case, and userspace code might accidentally * have depended on that bug. */ return -ENOTTY; } return 0; } #ifdef CONFIG_COMPAT struct i2c_smbus_ioctl_data32 { u8 read_write; u8 command; u32 size; compat_caddr_t data; /* union i2c_smbus_data *data */ }; struct i2c_msg32 { u16 addr; u16 flags; u16 len; compat_caddr_t buf; }; struct i2c_rdwr_ioctl_data32 { compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ u32 nmsgs; }; static long compat_i2cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct i2c_client *client = file->private_data; unsigned long funcs; switch (cmd) { case I2C_FUNCS: funcs = i2c_get_functionality(client->adapter); return put_user(funcs, (compat_ulong_t __user *)arg); case I2C_RDWR: { struct i2c_rdwr_ioctl_data32 rdwr_arg; struct i2c_msg32 __user *p; struct i2c_msg *rdwr_pa; int i; if (copy_from_user(&rdwr_arg, (struct i2c_rdwr_ioctl_data32 __user *)arg, sizeof(rdwr_arg))) return -EFAULT; if (!rdwr_arg.msgs || rdwr_arg.nmsgs == 0) return -EINVAL; if (rdwr_arg.nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) return -EINVAL; rdwr_pa = kmalloc_array(rdwr_arg.nmsgs, sizeof(struct i2c_msg), GFP_KERNEL); if (!rdwr_pa) return -ENOMEM; p = compat_ptr(rdwr_arg.msgs); for (i = 0; i < rdwr_arg.nmsgs; i++) { struct i2c_msg32 umsg; if (copy_from_user(&umsg, p + i, sizeof(umsg))) { kfree(rdwr_pa); return -EFAULT; } rdwr_pa[i] = (struct i2c_msg) { .addr = umsg.addr, .flags = umsg.flags, .len = umsg.len, .buf = (__force __u8 *)compat_ptr(umsg.buf), }; } return i2cdev_ioctl_rdwr(client, rdwr_arg.nmsgs, rdwr_pa); } case I2C_SMBUS: { struct i2c_smbus_ioctl_data32 data32; if (copy_from_user(&data32, (void __user *) arg, sizeof(data32))) return -EFAULT; return i2cdev_ioctl_smbus(client, data32.read_write, data32.command, data32.size, compat_ptr(data32.data)); } default: return i2cdev_ioctl(file, cmd, arg); } } #else #define compat_i2cdev_ioctl NULL #endif static int i2cdev_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct i2c_client *client; struct i2c_adapter *adap; adap = i2c_get_adapter(minor); if (!adap) return -ENODEV; /* This creates an anonymous i2c_client, which may later be * pointed to some address using I2C_SLAVE or I2C_SLAVE_FORCE. * * This client is ** NEVER REGISTERED ** with the driver model * or I2C core code!! It just holds private copies of addressing * information and maybe a PEC flag. */ client = kzalloc(sizeof(*client), GFP_KERNEL); if (!client) { i2c_put_adapter(adap); return -ENOMEM; } snprintf(client->name, I2C_NAME_SIZE, "i2c-dev %d", adap->nr); client->adapter = adap; file->private_data = client; return 0; } static int i2cdev_release(struct inode *inode, struct file *file) { struct i2c_client *client = file->private_data; i2c_put_adapter(client->adapter); kfree(client); file->private_data = NULL; return 0; } static const struct file_operations i2cdev_fops = { .owner = THIS_MODULE, .llseek = no_llseek, .read = i2cdev_read, .write = i2cdev_write, .unlocked_ioctl = i2cdev_ioctl, .compat_ioctl = compat_i2cdev_ioctl, .open = i2cdev_open, .release = i2cdev_release, }; /* ------------------------------------------------------------------------- */ static const struct class i2c_dev_class = { .name = "i2c-dev", .dev_groups = i2c_groups, }; static void i2cdev_dev_release(struct device *dev) { struct i2c_dev *i2c_dev; i2c_dev = container_of(dev, struct i2c_dev, dev); kfree(i2c_dev); } static int i2cdev_attach_adapter(struct device *dev) { struct i2c_adapter *adap; struct i2c_dev *i2c_dev; int res; if (dev->type != &i2c_adapter_type) return NOTIFY_DONE; adap = to_i2c_adapter(dev); i2c_dev = get_free_i2c_dev(adap); if (IS_ERR(i2c_dev)) return NOTIFY_DONE; cdev_init(&i2c_dev->cdev, &i2cdev_fops); i2c_dev->cdev.owner = THIS_MODULE; device_initialize(&i2c_dev->dev); i2c_dev->dev.devt = MKDEV(I2C_MAJOR, adap->nr); i2c_dev->dev.class = &i2c_dev_class; i2c_dev->dev.parent = &adap->dev; i2c_dev->dev.release = i2cdev_dev_release; res = dev_set_name(&i2c_dev->dev, "i2c-%d", adap->nr); if (res) goto err_put_i2c_dev; res = cdev_device_add(&i2c_dev->cdev, &i2c_dev->dev); if (res) goto err_put_i2c_dev; pr_debug("adapter [%s] registered as minor %d\n", adap->name, adap->nr); return NOTIFY_OK; err_put_i2c_dev: put_i2c_dev(i2c_dev, false); return NOTIFY_DONE; } static int i2cdev_detach_adapter(struct device *dev) { struct i2c_adapter *adap; struct i2c_dev *i2c_dev; if (dev->type != &i2c_adapter_type) return NOTIFY_DONE; adap = to_i2c_adapter(dev); i2c_dev = i2c_dev_get_by_minor(adap->nr); if (!i2c_dev) /* attach_adapter must have failed */ return NOTIFY_DONE; put_i2c_dev(i2c_dev, true); pr_debug("adapter [%s] unregistered\n", adap->name); return NOTIFY_OK; } static int i2cdev_notifier_call(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; switch (action) { case BUS_NOTIFY_ADD_DEVICE: return i2cdev_attach_adapter(dev); case BUS_NOTIFY_DEL_DEVICE: return i2cdev_detach_adapter(dev); } return NOTIFY_DONE; } static struct notifier_block i2cdev_notifier = { .notifier_call = i2cdev_notifier_call, }; /* ------------------------------------------------------------------------- */ static int __init i2c_dev_attach_adapter(struct device *dev, void *dummy) { i2cdev_attach_adapter(dev); return 0; } static int __exit i2c_dev_detach_adapter(struct device *dev, void *dummy) { i2cdev_detach_adapter(dev); return 0; } /* * module load/unload record keeping */ static int __init i2c_dev_init(void) { int res; pr_info("i2c /dev entries driver\n"); res = register_chrdev_region(MKDEV(I2C_MAJOR, 0), I2C_MINORS, "i2c"); if (res) goto out; res = class_register(&i2c_dev_class); if (res) goto out_unreg_chrdev; /* Keep track of adapters which will be added or removed later */ res = bus_register_notifier(&i2c_bus_type, &i2cdev_notifier); if (res) goto out_unreg_class; /* Bind to already existing adapters right away */ i2c_for_each_dev(NULL, i2c_dev_attach_adapter); return 0; out_unreg_class: class_unregister(&i2c_dev_class); out_unreg_chrdev: unregister_chrdev_region(MKDEV(I2C_MAJOR, 0), I2C_MINORS); out: pr_err("Driver Initialisation failed\n"); return res; } static void __exit i2c_dev_exit(void) { bus_unregister_notifier(&i2c_bus_type, &i2cdev_notifier); i2c_for_each_dev(NULL, i2c_dev_detach_adapter); class_unregister(&i2c_dev_class); unregister_chrdev_region(MKDEV(I2C_MAJOR, 0), I2C_MINORS); } MODULE_AUTHOR("Frodo Looijaard <frodol@dds.nl>"); MODULE_AUTHOR("Simon G. Vogl <simon@tk.uni-linz.ac.at>"); MODULE_DESCRIPTION("I2C /dev entries driver"); MODULE_LICENSE("GPL"); module_init(i2c_dev_init); module_exit(i2c_dev_exit);
3096 32 11637 2897 32 2 2 219 218 219 218 11637 2122 15176 29 29 1 29 2 4427 9180 6387 6403 116 4694 146 235 236 236 34 99 3523 117 8 1137 465 508 146 6754 1 6757 2 2 2 2 2 49 49 2894 3898 4039 4028 3340 4039 4031 2896 4038 8397 8447 32 115 116 8355 8338 8355 8329 59 531 6764 6765 266 206 26 266 7000 4434 6753 5419 7000 3201 3213 3180 365 365 2641 126 504 38 3478 163 88 163 163 163 2 2 1 2829 2829 2823 2821 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* memcontrol.h - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <balbir@linux.vnet.ibm.com> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> */ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include <linux/cgroup.h> #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> #include <linux/shrinker.h> struct mem_cgroup; struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, MEMCG_NR_STAT, }; enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; }; #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 struct mem_cgroup_id { int id; refcount_t ref; }; struct memcg_vmstats_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ unsigned int generation; }; /* * per-node information in memory controller. */ struct mem_cgroup_per_node { /* Keep the read-only fields at the start */ struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; struct shrinker_info __rcu *shrinker_info; #ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields * and update often fields to avoid false sharing. If v1 stuff is * not present, an explicit padding is needed. */ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; #else CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ struct lruvec lruvec; CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ struct mem_cgroup_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* Range enforcement for interrupt charges */ struct work_struct high_work; #ifdef CONFIG_ZSWAP unsigned long zswap_max; /* * Prevent pages from this memcg from being written back from zswap to * swap, and from being swapped out on zswap store failures. */ bool zswap_writeback; #endif /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the OOM killer kill all belonging tasks, had it kill one? */ bool oom_group; int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; /* memory.stat */ struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; /* * Hint of reclaim pressure for socket memroy management. Note * that this indicator should NOT be used in legacy cgroup mode * where socket memory is accounted/charged separately. */ unsigned long socket_pressure; int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting * process. memcg->orig_objcg preserves a pointer (and a reference) * to the original objcg until the end of live of memcg. */ struct obj_cgroup __rcu *objcg; struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif #ifdef CONFIG_MEMCG_V1 /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ unsigned long soft_limit; /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; /* OOM-Killer disable */ int oom_kill_disable; /* protect arrays of thresholds */ struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; /* taken only while moving_account > 0 */ spinlock_t move_lock; unsigned long move_lock_flags; /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ atomic_t moving_account; struct task_struct *move_lock_task; /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ struct mem_cgroup_per_node *nodeinfo[]; }; /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the * workload. */ #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ MEMCG_DATA_OBJEXTS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { /* slabobj_ext vector failed to allocate */ OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released: * e.g. acquire the rcu_read_lock or css_set_lock. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { return READ_ONCE(objcg->memcg); } /* * __folio_memcg - Get the memory cgroup associated with a non-kmem folio * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * kmem folios. */ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * __folio_objcg - get the object cgroup associated with a kmem folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * LRU folios. */ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * folio_memcg - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { if (folio_memcg_kmem(folio)) return obj_cgroup_memcg(__folio_objcg(folio)); return __folio_memcg(folio); } /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * Return: A pointer to the memory cgroup associated with the folio, * or NULL. */ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { unsigned long memcg_data = READ_ONCE(folio->memcg_data); VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function unlike folio_memcg() can take any folio * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - lock_folio_memcg() * - exclusive reference * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* * Because folio->memcg_data might be changed asynchronously * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } static inline struct mem_cgroup *page_memcg_check(struct page *page) { if (PageTail(page)) return NULL; return folio_memcg_check((struct folio *)page); } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; rcu_read_lock(); retry: memcg = obj_cgroup_memcg(objcg); if (unlikely(!css_tryget(&memcg->css))) goto retry; rcu_read_unlock(); return memcg; } /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. * * Checks if the folio has MemcgKmem flag set. The caller must ensure * that the folio has an associated memory cgroup. It's not safe to call * this function against some types of folios, e.g. slab folios. */ static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; if (mem_cgroup_disabled()) return; /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. * * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. The target memcg's protection is ignored, see * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || memcg == target; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. * @folio: Folio to charge. * @mm: mm context of the allocating task. * @gfp: Reclaim mode. * * Try to charge @folio to the memcg that @mm belongs to, reclaiming * pages according to @gfp if necessary. If @mm is NULL, try to * charge to the active memcg. * * Do not use this for folios allocated for swapin. * * Return: 0 on success. Otherwise, an error code is returned. */ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); void __mem_cgroup_uncharge(struct folio *folio); /** * mem_cgroup_uncharge - Uncharge a folio. * @folio: Folio to uncharge. * * Uncharge a folio previously charged with mem_cgroup_charge(). */ static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } /** * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * * This function relies on folio->mem_cgroup being stable. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { percpu_ref_get(&objcg->refcnt); } static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { percpu_ref_get_many(&objcg->refcnt, nr); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { if (objcg) percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return !memcg || css_tryget(&memcg->css); } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return !memcg || css_tryget_online(&memcg->css); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root. */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return !!(memcg->css.flags & CSS_ONLINE); } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); unsigned long mem_cgroup_size(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_state(memcg, idx, val); local_irq_restore(flags); } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { unsigned long flags; local_irq_save(flags); __count_memcg_events(memcg, idx, count); local_irq_restore(flags); } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { struct mem_cgroup *memcg = folio_memcg(folio); if (memcg) count_memcg_events(memcg, idx, nr); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, idx, 1); rcu_read_unlock(); } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; atomic_long_inc(&memcg->memory_events_local[event]); if (!swap_event) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); if (swap_event) cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *head, int old_order, int new_order); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; } static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); return NULL; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { return NULL; } static inline bool folio_memcg_kmem(struct folio *folio) { return false; } static inline bool PageMemcgKmem(struct page *page) { return false; } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { } static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages) { return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; } static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) { } static inline void mem_cgroup_uncharge(struct folio *folio) { } static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { } static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { } static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_current(void) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { return NULL; } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return true; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { return NULL; } static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } static inline void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); mod_node_page_state(page_pgdat(page), idx, val); } static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } #endif /* CONFIG_MEMCG */ /* * Extended information for slab objects stored as an array in page->memcg_data * if MEMCG_DATA_OBJEXTS is set. */ struct slabobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref ref; #endif } __aligned(8); static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); } static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; memcg = lruvec_memcg(lruvec); if (!memcg) return NULL; memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } static inline void unlock_page_lruvec(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { return lruvec_pgdat(lruvec) == folio_pgdat(folio) && lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); } /* Don't lock again iff folio's lruvec locked */ static inline void folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec **lruvecp, unsigned long *flags) { if (*lruvecp) { if (folio_matches_lruvec(folio, *lruvecp)) return; unlock_page_lruvec_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; memcg = folio_memcg(folio); if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; } static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { } static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { #ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; #endif /* CONFIG_MEMCG_V1 */ do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; } int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } #endif #ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); /* * The returned objcg pointer is safe to use without additional * protection within a scope. The scope is defined either by * the current task (similar to the "current" global variable) * or by set_active_memcg() pair. * Please, use obj_cgroup_get() to get a reference if the pointer * needs to be used outside of the local scope. */ struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); static inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = current_obj_cgroup(); if (objcg) obj_cgroup_get(objcg); return objcg; } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); extern struct static_key_false memcg_bpf_enabled_key; static inline bool memcg_bpf_enabled(void) { return static_branch_likely(&memcg_bpf_enabled_key); } extern struct static_key_false memcg_kmem_online_key; static inline bool memcg_kmem_online(void) { return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. */ static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return memcg ? memcg->kmemcg_id : -1; } struct mem_cgroup *mem_cgroup_from_obj(void *p); struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { struct mem_cgroup *memcg; if (!memcg_kmem_online()) return; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); count_memcg_events(memcg, idx, 1); rcu_read_unlock(); } #else static inline bool mem_cgroup_kmem_disabled(void) { return true; } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { } static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } static inline bool memcg_bpf_enabled(void) { return false; } static inline bool memcg_kmem_online(void) { return false; } static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return -1; } static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; } static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { } #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; } static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ return true; } #endif /* Cgroup v1-related declarations */ #ifdef CONFIG_MEMCG_V1 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); bool mem_cgroup_oom_synchronize(bool wait); static inline bool task_in_memcg_oom(struct task_struct *p) { return p->memcg_in_oom; } void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); /* try to stablize folio_memcg() for all the pages in a memcg */ static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) { rcu_read_lock(); if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) return true; rcu_read_unlock(); return false; } static inline void mem_cgroup_unlock_pages(void) { rcu_read_unlock(); } static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); current->in_user_fault = 1; } static inline void mem_cgroup_exit_user_fault(void) { WARN_ON(!current->in_user_fault); current->in_user_fault = 0; } #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { return 0; } static inline void folio_memcg_lock(struct folio *folio) { } static inline void folio_memcg_unlock(struct folio *folio) { } static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) { /* to match folio_memcg_rcu() */ rcu_read_lock(); return true; } static inline void mem_cgroup_unlock_pages(void) { rcu_read_unlock(); } static inline bool task_in_memcg_oom(struct task_struct *p) { return false; } static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } static inline void mem_cgroup_enter_user_fault(void) { } static inline void mem_cgroup_exit_user_fault(void) { } #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */
37 10 26 4 4 5 3 5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 #ifndef __LINUX_MROUTE_BASE_H #define __LINUX_MROUTE_BASE_H #include <linux/netdevice.h> #include <linux/rhashtable-types.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> /** * struct vif_device - interface representor for multicast routing * @dev: network device being used * @dev_tracker: refcount tracker for @dev reference * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing * @pkt_out: statistic; packets egressing * @rate_limit: Traffic shaping (NI) * @threshold: TTL threshold * @flags: Control flags * @link: Physical interface index * @dev_parent_id: device parent id * @local: Local address * @remote: Remote address for tunnels */ struct vif_device { struct net_device __rcu *dev; netdevice_tracker dev_tracker; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; unsigned char threshold; unsigned short flags; int link; /* Currently only used by ipmr */ struct netdev_phys_item_id dev_parent_id; __be32 local, remote; }; struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; unsigned short vif_index; unsigned short vif_flags; u32 tb_id; }; static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, unsigned int *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { .family = family, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. */ #define MAXVIFS 32 #endif /* Note: This helper is deprecated. */ #define VIF_EXISTS(_mrt, _idx) (!!rcu_access_pointer((_mrt)->vif_table[_idx].dev)) /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), MFC_OFFLOAD = BIT(1), }; /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list * @mfc_parent: source interface (iif) * @mfc_flags: entry flags * @expires: unresolved entry expire time * @unresolved: unresolved cached skbs * @last_assert: time of last assert * @minvif: minimum VIF id * @maxvif: maximum VIF id * @bytes: bytes that have passed for this entry * @pkt: packets that have passed for this entry * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; unsigned short mfc_parent; int mfc_flags; union { struct { unsigned long expires; struct sk_buff_head unresolved; } unres; struct { unsigned long last_assert; int minvif; int maxvif; unsigned long bytes; unsigned long pkt; unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; } res; } mfc_un; struct list_head list; struct rcu_head rcu; void (*free)(struct rcu_head *head); }; static inline void mr_cache_put(struct mr_mfc *c) { if (refcount_dec_and_test(&c->mfc_un.res.refcount)) call_rcu(&c->rcu, c->free); } static inline void mr_cache_hold(struct mr_mfc *c) { refcount_inc(&c->mfc_un.res.refcount); } struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; u32 tb_id; }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .mfc = mfc, .tb_id = tb_id }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, unsigned int *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { .family = family, }, .mfc = mfc, .tb_id = tb_id }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } struct mr_table; /** * struct mr_table_ops - callbacks and info for protocol-specific ops * @rht_params: parameters for accessing the MFC hash * @cmparg_any: a hash key to be used for matching on (*,*) routes */ struct mr_table_ops { const struct rhashtable_params *rht_params; void *cmparg_any; }; /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes * @mfc_unres_queue: list of unresolved MFC entries * @vif_table: array containing all possible vifs * @mfc_hash: Hash table of all resolved routes for easy lookup * @mfc_cache_list: list of resovled routes for possible traversal * @maxvif: Identifier of highest value vif currently in use * @cache_resolve_queue_len: current size of unresolved queue * @mroute_do_assert: Whether to inform userspace on wrong ingress * @mroute_do_pim: Whether to receive IGMP PIMv1 * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { struct list_head list; possible_net_t net; struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct vif_device vif_table[MAXVIFS]; struct rhltable mfc_hash; struct list_head mfc_cache_list; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent); void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm); int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask) { } static inline void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { return NULL; } static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { return NULL; } static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { return NULL; } static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { return -EINVAL; } static inline int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { return -EINVAL; } static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack) { return -EINVAL; } #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } #ifdef CONFIG_PROC_FS struct mr_vif_iter { struct seq_net_private p; struct mr_table *mrt; int ct; }; struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; /* Lock protecting the mr_table's unresolved queue */ spinlock_t *lock; }; #ifdef CONFIG_IP_MROUTE_COMMON void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return *pos ? mr_vif_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { struct mr_mfc_iter *it = seq->private; it->mrt = mrt; it->cache = NULL; it->lock = lock; return *pos ? mr_mfc_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(it->lock); else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } #else static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { return NULL; } static inline void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { return NULL; } static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { return NULL; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { } #endif #endif #endif
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 // SPDX-License-Identifier: GPL-2.0-only /* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying * cluster stacks. * * Copyright (C) 2007, 2009 Oracle. All rights reserved. */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/sysctl.h> #include "ocfs2_fs.h" #include "stackglue.h" #define OCFS2_STACK_PLUGIN_O2CB "o2cb" #define OCFS2_STACK_PLUGIN_USER "user" #define OCFS2_MAX_HB_CTL_PATH 256 static struct ocfs2_protocol_version locking_max_version; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; /* * The stack currently in use. If not null, active_stack->sp_count > 0, * the module is pinned, and the locking protocol cannot be changed. */ static struct ocfs2_stack_plugin *active_stack; static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; assert_spin_locked(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { if (!strcmp(p->sp_name, name)) return p; } return NULL; } static int ocfs2_stack_driver_request(const char *stack_name, const char *plugin_name) { int rc; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); /* * If the stack passed by the filesystem isn't the selected one, * we can't continue. */ if (strcmp(stack_name, cluster_stack_name)) { rc = -EBUSY; goto out; } if (active_stack) { /* * If the active stack isn't the one we want, it cannot * be selected right now. */ if (!strcmp(active_stack->sp_name, plugin_name)) rc = 0; else rc = -EBUSY; goto out; } p = ocfs2_stack_lookup(plugin_name); if (!p || !try_module_get(p->sp_owner)) { rc = -ENOENT; goto out; } active_stack = p; rc = 0; out: /* If we found it, pin it */ if (!rc) active_stack->sp_count++; spin_unlock(&ocfs2_stack_lock); return rc; } /* * This function looks up the appropriate stack and makes it active. If * there is no stack, it tries to load it. It will fail if the stack still * cannot be found. It will also fail if a different stack is in use. */ static int ocfs2_stack_driver_get(const char *stack_name) { int rc; char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; /* * Classic stack does not pass in a stack name. This is * compatible with older tools as well. */ if (!stack_name || !*stack_name) stack_name = OCFS2_STACK_PLUGIN_O2CB; if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { printk(KERN_ERR "ocfs2 passed an invalid cluster stack label: \"%s\"\n", stack_name); return -EINVAL; } /* Anything that isn't the classic stack is a user stack */ if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) plugin_name = OCFS2_STACK_PLUGIN_USER; rc = ocfs2_stack_driver_request(stack_name, plugin_name); if (rc == -ENOENT) { request_module("ocfs2_stack_%s", plugin_name); rc = ocfs2_stack_driver_request(stack_name, plugin_name); } if (rc == -ENOENT) { printk(KERN_ERR "ocfs2: Cluster stack driver \"%s\" cannot be found\n", plugin_name); } else if (rc == -EBUSY) { printk(KERN_ERR "ocfs2: A different cluster stack is in use\n"); } return rc; } static void ocfs2_stack_driver_put(void) { spin_lock(&ocfs2_stack_lock); BUG_ON(active_stack == NULL); BUG_ON(active_stack->sp_count == 0); active_stack->sp_count--; if (!active_stack->sp_count) { module_put(active_stack->sp_owner); active_stack = NULL; } spin_unlock(&ocfs2_stack_lock); } int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) { int rc; spin_lock(&ocfs2_stack_lock); if (!ocfs2_stack_lookup(plugin->sp_name)) { plugin->sp_count = 0; plugin->sp_max_proto = locking_max_version; list_add(&plugin->sp_list, &ocfs2_stack_list); printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", plugin->sp_name); rc = 0; } else { printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", plugin->sp_name); rc = -EEXIST; } spin_unlock(&ocfs2_stack_lock); return rc; } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); p = ocfs2_stack_lookup(plugin->sp_name); if (p) { BUG_ON(p != plugin); BUG_ON(plugin == active_stack); BUG_ON(plugin->sp_count != 0); list_del_init(&plugin->sp_list); printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", plugin->sp_name); } else { printk(KERN_ERR "Stack \"%s\" is not registered\n", plugin->sp_name); } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); if (memcmp(max_proto, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { BUG_ON(locking_max_version.pv_major != 0); locking_max_version = *max_proto; list_for_each_entry(p, &ocfs2_stack_list, sp_list) { p->sp_max_proto = locking_max_version; } } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); /* * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument * for the ast and bast functions. They will pass the lksb to the ast * and bast. The caller can wrap the lksb with their own structure to * get more information. */ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { if (!lksb->lksb_conn) lksb->lksb_conn = conn; else BUG_ON(lksb->lksb_conn != conn); return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, name, namelen); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { BUG_ON(lksb->lksb_conn == NULL); return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); } EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_status(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lvb_valid(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_lvb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) { active_stack->sp_ops->dump_lksb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); int ocfs2_stack_supports_plocks(void) { return active_stack && active_stack->sp_ops->plock; } EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); /* * ocfs2_plock() can only be safely called if * ocfs2_stack_supports_plocks() returned true */ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, struct file *file, int cmd, struct file_lock *fl) { WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); if (active_stack->sp_ops->plock) return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, const char *cluster_name, int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { int rc = 0; struct ocfs2_cluster_connection *new_conn; BUG_ON(group == NULL); BUG_ON(conn == NULL); BUG_ON(recovery_handler == NULL); if (grouplen > GROUP_NAME_MAX) { rc = -EINVAL; goto out; } if (memcmp(&lproto->lp_max_version, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { rc = -EINVAL; goto out; } new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), GFP_KERNEL); if (!new_conn) { rc = -ENOMEM; goto out; } strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; new_conn->cc_proto = lproto; /* Start the new connection at our maximum compatibility level */ new_conn->cc_version = lproto->lp_max_version; /* This will pin the stack driver if successful */ rc = ocfs2_stack_driver_get(stack_name); if (rc) goto out_free; rc = active_stack->sp_ops->connect(new_conn); if (rc) { ocfs2_stack_driver_put(); goto out_free; } *conn = new_conn; out_free: if (rc) kfree(new_conn); out: return rc; } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); /* The caller will ensure all nodes have the same cluster stack */ int ocfs2_cluster_connect_agnostic(const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { char *stack_name = NULL; if (cluster_stack_name[0]) stack_name = cluster_stack_name; return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, lproto, recovery_handler, recovery_data, conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); /* If hangup_pending is 0, the stack driver will be dropped */ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending) { int ret; BUG_ON(conn == NULL); ret = active_stack->sp_ops->disconnect(conn); /* XXX Should we free it anyway? */ if (!ret) { kfree(conn); if (!hangup_pending) ocfs2_stack_driver_put(); } return ret; } EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); /* * Leave the group for this filesystem. This is executed by a userspace * program (stored in ocfs2_hb_ctl_path). */ static void ocfs2_leave_group(const char *group) { int ret; char *argv[5], *envp[3]; argv[0] = ocfs2_hb_ctl_path; argv[1] = "-K"; argv[2] = "-u"; argv[3] = (char *)group; argv[4] = NULL; /* minimal command environment taken from cpu_run_sbin_hotplug */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) { printk(KERN_ERR "ocfs2: Error %d running user helper " "\"%s %s %s %s\"\n", ret, argv[0], argv[1], argv[2], argv[3]); } } /* * Hangup is a required post-umount. ocfs2-tools software expects the * filesystem to call "ocfs2_hb_ctl" during unmount. This happens * regardless of whether the DLM got started, so we can't do it * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does * the actual work. */ void ocfs2_cluster_hangup(const char *group, int grouplen) { BUG_ON(group == NULL); BUG_ON(group[grouplen] != '\0'); ocfs2_leave_group(group); /* cluster_disconnect() was called with hangup_pending==1 */ ocfs2_stack_driver_put(); } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *node) { return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); /* * Sysfs bits */ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (locking_max_version.pv_major) ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", locking_max_version.pv_major, locking_max_version.pv_minor); spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_max_locking_protocol = __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0, total = 0, remain = PAGE_SIZE; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; } total += ret; remain -= ret; } spin_unlock(&ocfs2_stack_lock); return total; } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret; spin_lock(&ocfs2_stack_lock); ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); spin_unlock(&ocfs2_stack_lock); return ret; } static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { size_t len = count; ssize_t ret; if (len == 0) return len; if (buf[len - 1] == '\n') len--; if ((len != OCFS2_STACK_LABEL_LEN) || (strnlen(buf, len) != len)) return -EINVAL; spin_lock(&ocfs2_stack_lock); if (active_stack) { if (!strncmp(buf, cluster_stack_name, len)) ret = count; else ret = -EBUSY; } else { memcpy(cluster_stack_name, buf, len); ret = count; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_cluster_stack = __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "1\n"); } static struct kobj_attribute ocfs2_attr_dlm_recover_support = __ATTR(dlm_recover_callback_support, S_IRUGO, ocfs2_dlm_recover_show, NULL); static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, &ocfs2_attr_dlm_recover_support.attr, NULL, }; static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; struct kset *ocfs2_kset; EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { kset_unregister(ocfs2_kset); } static int ocfs2_sysfs_init(void) { int ret; ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); if (!ocfs2_kset) return -ENOMEM; ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); if (ret) goto error; return 0; error: kset_unregister(ocfs2_kset); return ret; } /* * Sysctl bits * * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't * make as much sense in a multiple cluster stack world, but it's safer * and easier to preserve the name. */ static struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, .maxlen = OCFS2_MAX_HB_CTL_PATH, .mode = 0644, .proc_handler = proc_dostring, }, }; static struct ctl_table_header *ocfs2_table_header; /* * Initialization */ static int __init ocfs2_stack_glue_init(void) { int ret; strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); return -ENOMEM; /* or something. */ } ret = ocfs2_sysfs_init(); if (ret) unregister_sysctl_table(ocfs2_table_header); return ret; } static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); if (ocfs2_table_header) unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit);
36 34 34 34 33 34 34 34 34 34 34 36 35 36 34 36 34 34 33 34 34 34 34 36 34 34 34 34 34 34 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 // SPDX-License-Identifier: GPL-2.0 #include "eytzinger.h" /** * is_aligned - is this pointer & size okay for word-wide copying? * @base: pointer to data * @size: size of each element * @align: required alignment (typically 4 or 8) * * Returns true if elements can be copied using word loads and stores. * The size must be a multiple of the alignment, and the base address must * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. * * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" * to "if ((a | b) & mask)", so we do that by hand. */ __attribute_const__ __always_inline static bool is_aligned(const void *base, size_t size, unsigned char align) { unsigned char lsbits = (unsigned char)size; (void)base; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS lsbits |= (unsigned char)(uintptr_t)base; #endif return (lsbits & (align - 1)) == 0; } /** * swap_words_32 - swap two elements in 32-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 4) * * Exchange the two objects in memory. This exploits base+index addressing, * which basically all CPUs have, to minimize loop overhead computations. * * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the * bottom of the loop, even though the zero flag is still valid from the * subtract (since the intervening mov instructions don't alter the flags). * Gcc 8.1.0 doesn't have that problem. */ static void swap_words_32(void *a, void *b, size_t n) { do { u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; } while (n); } /** * swap_words_64 - swap two elements in 64-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 8) * * Exchange the two objects in memory. This exploits base+index * addressing, which basically all CPUs have, to minimize loop overhead * computations. * * We'd like to use 64-bit loads if possible. If they're not, emulating * one requires base+index+4 addressing which x86 has but most other * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, * but it's possible to have 64-bit loads without 64-bit pointers (e.g. * x32 ABI). Are there any cases the kernel needs to worry about? */ static void swap_words_64(void *a, void *b, size_t n) { do { #ifdef CONFIG_64BIT u64 t = *(u64 *)(a + (n -= 8)); *(u64 *)(a + n) = *(u64 *)(b + n); *(u64 *)(b + n) = t; #else /* Use two 32-bit transfers to avoid base+index+4 addressing */ u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; #endif } while (n); } /** * swap_bytes - swap two elements a byte at a time * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size * * This is the fallback if alignment doesn't allow using larger chunks. */ static void swap_bytes(void *a, void *b, size_t n) { do { char t = ((char *)a)[--n]; ((char *)a)[n] = ((char *)b)[n]; ((char *)b)[n] = t; } while (n); } /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare * instructions. */ #define SWAP_WORDS_64 (swap_r_func_t)0 #define SWAP_WORDS_32 (swap_r_func_t)1 #define SWAP_BYTES (swap_r_func_t)2 #define SWAP_WRAPPER (swap_r_func_t)3 struct wrapper { cmp_func_t cmp; swap_func_t swap_func; }; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. */ static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { ((const struct wrapper *)priv)->swap_func(a, b, (int)size); return; } if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) swap_words_32(a, b, size); else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, cmp_r_func_t cmp_func, const void *priv, size_t l, size_t r) { return do_cmp(base + inorder_to_eytzinger0(l, n) * size, base + inorder_to_eytzinger0(r, n) * size, cmp_func, priv); } static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, swap_r_func_t swap_func, const void *priv, size_t l, size_t r) { do_swap(base + inorder_to_eytzinger0(l, n) * size, base + inorder_to_eytzinger0(r, n) * size, size, swap_func, priv); } void eytzinger0_sort_r(void *base, size_t n, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { int i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; else if (is_aligned(base, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* heapify */ for (i = n / 2 - 1; i >= 0; --i) { /* Find the sift-down path all the way to the leaves. */ for (j = i; k = j * 2 + 1, k + 1 < n;) j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ if (j * 2 + 2 == n) j = j * 2 + 1; /* Backtrack to the correct location. */ while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) j = (j - 1) / 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { j = (j - 1) / 2; eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } /* sort */ for (i = n - 1; i > 0; --i) { eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); /* Find the sift-down path all the way to the leaves. */ for (j = 0; k = j * 2 + 1, k + 1 < i;) j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ if (j * 2 + 2 == i) j = j * 2 + 1; /* Backtrack to the correct location. */ while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) j = (j - 1) / 2; /* Shift the element into its correct place. */ for (k = j; j;) { j = (j - 1) / 2; eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } } void eytzinger0_sort(void *base, size_t n, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap_func = swap_func, }; return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } #if 0 #include <linux/slab.h> #include <linux/random.h> #include <linux/ktime.h> static u64 cmp_count; static int mycmp(const void *a, const void *b) { u32 _a = *(u32 *)a; u32 _b = *(u32 *)b; cmp_count++; if (_a < _b) return -1; else if (_a > _b) return 1; else return 0; } static int test(void) { size_t N, i; ktime_t start, end; s64 delta; u32 *arr; for (N = 10000; N <= 100000; N += 10000) { arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); cmp_count = 0; for (i = 0; i < N; i++) arr[i] = get_random_u32(); start = ktime_get(); eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); end = ktime_get(); delta = ktime_us_delta(end, start); printk(KERN_INFO "time: %lld\n", delta); printk(KERN_INFO "comparisons: %lld\n", cmp_count); u32 prev = 0; eytzinger0_for_each(i, N) { if (prev > arr[i]) goto err; prev = arr[i]; } kfree(arr); } return 0; err: kfree(arr); return -1; } #endif
42 145 42 128 983 148 76 96 58 5 80 22 136 112 103 11 12 168 160 153 117 155 78 132 96 102 58 168 130 130 121 121 5 130 5 11 137 136 2 2 137 137 25 166 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H #include <linux/mm.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> #include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> #include <linux/userfaultfd_k.h> struct ctl_table; struct user_struct; struct mmu_gather; struct node; void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE #include <linux/pagemap.h> #include <linux/shm.h> #include <asm/tlbflush.h> /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail * struct page to store the metadata. */ #define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; long count; long max_hpages; /* Maximum huge pages or -1 if no maximum. */ long used_hpages; /* Used count against maximum, includes */ /* both allocated and reserved pages. */ struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ /* satisfy minimum size. */ }; struct resv_map { struct kref refs; spinlock_t lock; struct list_head regions; long adds_in_progress; struct list_head region_cache; long region_cache_count; struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored * here. If these fields are 0, then either the mapping is shared, or * cgroup accounting is disabled for this resv_map. */ struct page_counter *reservation_counter; unsigned long pages_per_hpage; struct cgroup_subsys_state *css; #endif }; /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * * The region data structures are embedded into a resv_map and protected * by a resv_map's lock. The set of regions within the resv_map represent * reservations for huge pages, or huge pages that have already been * instantiated within the map. The from and to elements are huge page * indices into the associated mapping. from indicates the starting index * of the region. to represents the first index past the end of the region. * * For example, a file region structure with from == 0 and to == 4 represents * four huge pages in a mapping. It is important to note that the to element * represents the first element past the end of the region. This is used in * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. * * Interval notation of the form [from, to) will be used to indicate that * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; long from; long to; #ifdef CONFIG_CGROUP_HUGETLB /* * On shared mappings, each reserved region appears as a struct * file_region in resv_map. These fields hold the info needed to * uncharge each reservation. */ struct page_counter *reservation_counter; struct cgroup_subsys_state *css; #endif }; struct hugetlb_vma_lock { struct kref refs; struct rw_semaphore rw_sema; struct vm_area_struct *vma; }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address); struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ #ifndef CONFIG_HIGHPTE /* * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures * which may go down to the lowest PTE level in their huge_pte_offset() and * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap(). */ static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address); } #endif pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); /* * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * * IMPORTANT: we should normally not directly call this function, instead * this is only a common interface to implement arch-specific * walker. Please use hugetlb_walk() instead, because that will attempt to * verify the locking for you. * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be * responsible of its thread safety. One can follow this rule: * * (1) For private mappings: pmd unsharing is not possible, so holding the * mmap_lock for either read or write is sufficient. Most callers * already hold the mmap_lock, so normally, no special action is * required. * * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged * pgtable page can go away from under us! It can be done by a pmd * unshare with a follow up munmap() on the other process), then we * need either: * * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare * won't happen upon the range (it also makes sure the pte_t we * read is the right and stable one), or, * * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make * sure even if unshare happened the racy unmap() will wait until * i_mmap_rwsem is released. * * Option (2.1) is the safest, which guarantees pte stability from pmd * sharing pov, until the vma lock released. Option (2.2) doesn't protect * a concurrent pmd unshare, but it makes sure the pgtable page is safe to * access. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); extern void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *begin, unsigned long *end); extern void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details); static inline void hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_begin(vma, start, end); } static inline void hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_end(vma, details); } void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); void hugetlb_vma_lock_write(struct vm_area_struct *vma); void hugetlb_vma_unlock_write(struct vm_area_struct *vma); int hugetlb_vma_trylock_write(struct vm_area_struct *vma); void hugetlb_vma_assert_locked(struct vm_area_struct *vma); void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { } static inline unsigned long hugetlb_total_pages(void) { return 0; } static inline struct address_space *hugetlb_folio_mapping_lock_write( struct folio *folio) { return NULL; } static inline int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_begin( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_end( struct vm_area_struct *vma, struct zap_details *details) { } static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { BUG(); return 0; } static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { BUG(); return 0; } static inline void hugetlb_report_meminfo(struct seq_file *m) { } static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) { return 0; } static inline void hugetlb_show_meminfo_node(int nid) { } static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { return -EINVAL; } static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { } static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { return 1; } static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { } static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { BUG(); } #ifdef CONFIG_USERFAULTFD static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { BUG(); return 0; } #endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { return NULL; } static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void folio_putback_active_hugetlb(struct folio *folio) { } static inline void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { } static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { return 0; } static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { BUG(); } static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { BUG(); return 0; } static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write static inline int pgd_write(pgd_t pgd) { BUG(); return 0; } #endif #define HUGETLB_ANON_FILE "anon_hugepage" enum { /* * The file will be used as an shm file so shmfs accounting rules * apply */ HUGETLB_SHMFS_INODE = 1, /* * The file is being created on the internal vfs mount and shmfs * accounting rules do not apply */ HUGETLB_ANONHUGE_INODE = 2, }; #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; struct hugepage_subpool *spool; kuid_t uid; kgid_t gid; umode_t mode; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; } struct hugetlbfs_inode_info { struct inode vfs_inode; unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) { return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) { return file->f_op->fop_flags & FOP_HUGE_PAGES; } static inline struct hstate *hstate_inode(struct inode *i) { return HUGETLBFS_SB(i->i_sb)->hstate; } #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); } static inline struct hstate *hstate_inode(struct inode *i) { return NULL; } #endif /* !CONFIG_HUGETLBFS */ #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ unsigned long generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be * used to manipulate these flags. * * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. * Synchronization: Examined or modified by code that knows it has * the only reference to page. i.e. After allocation but before use * or when the page is being freed. * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. * Synchronization: Can be set after huge page allocation from buddy when * code knows it has only reference. All other examinations and * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page * that is not tracked by raw_hwp_page list. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, __NR_HPAGEFLAGS, }; /* * Macros to create test, set and clear function definitions for * hugetlb specific page flags. */ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ { } #endif #define HPAGEFLAG(uname, flname) \ TESTHPAGEFLAG(uname, flname) \ SETHPAGEFLAG(uname, flname) \ CLEARHPAGEFLAG(uname, flname) \ /* * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; unsigned long free_huge_pages; unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; char name[HSTATE_NAME_LEN]; }; struct huge_bootmem_page { struct list_head list; struct hstate *hstate; }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); int __init alloc_bootmem_huge_page(struct hstate *h, int nid); bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE #define HUGE_MAX_HSTATE 1 #endif extern struct hstate hstates[HUGE_MAX_HSTATE]; extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return folio->_hugetlb_subpool; } static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { folio->_hugetlb_subpool = subpool; } static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); } static inline struct hstate *hstate_sizelog(int page_size_log) { if (!page_size_log) return &default_hstate; if (page_size_log < BITS_PER_LONG) return size_to_hstate(1UL << page_size_log); return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return hstate_file(vma->vm_file); } static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; } static inline unsigned int huge_page_order(struct hstate *h) { return h->order; } static inline unsigned huge_page_shift(struct hstate *h) { return h->order + PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } static inline unsigned int blocks_per_huge_page(struct hstate *h) { return huge_page_size(h) / 512; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return filemap_lock_folio(mapping, idx << huge_page_order(h)); } #include <asm/hugetlb.h> #ifndef is_hugepage_only_range static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } #define is_hugepage_only_range is_hugepage_only_range #endif #ifndef arch_clear_hugetlb_flags static inline void arch_clear_hugetlb_flags(struct folio *folio) { } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags #endif #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { return pte_mkhuge(entry); } #endif static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); return size_to_hstate(folio_size(folio)); } static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; } static inline int hstate_index(struct hstate *h) { return h - hstates; } int dissolve_free_hugetlb_folio(struct folio *folio); int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) { if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; } #endif #else static inline bool arch_hugetlb_migration_supported(struct hstate *h) { return false; } #endif static inline bool hugepage_migration_supported(struct hstate *h) { return arch_hugetlb_migration_supported(h); } /* * Movability check is different as compared to migration check. * It determines whether or not a huge page should be placed on * movable zone or not. Movability of any huge page should be * required only if huge page size is supported for migration. * There won't be any reason for the huge page to be movable if * it is not migratable to start with. Also the size of the huge * page should be large enough to be placed under a movable zone * and still feasible enough to be migratable. Just the presence * in movable zone does not make the migration feasible. * * So even though large huge page sizes like the gigantic ones * are migratable they should not be movable because its not * feasible to migrate them from movable zone. */ static inline bool hugepage_movable_supported(struct hstate *h) { if (!hugepage_migration_supported(h)) return false; if (hstate_is_gigantic(h)) return false; return true; } /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { if (hugepage_movable_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { gfp_t modified_mask = htlb_alloc_mask(h); /* Some callers might want to enforce node */ modified_mask |= (gfp_mask & __GFP_THISNODE); modified_mask |= (gfp_mask & __GFP_NOWARN); return modified_mask; } static inline bool htlb_allow_alloc_fallback(int reason) { bool allowed_fallback = false; /* * Note: the memory offline, memory failure and migration syscalls will * be allowed to fallback to other nodes due to lack of a better chioce, * that might break the per-node hugetlb pool. While other cases will * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool. */ switch (reason) { case MR_MEMORY_HOTPLUG: case MR_MEMORY_FAILURE: case MR_SYSCALL: case MR_MEMPOLICY_MBIND: allowed_fallback = true; break; default: break; } return allowed_fallback; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { const unsigned long size = huge_page_size(h); VM_WARN_ON(size == PAGE_SIZE); /* * hugetlb must use the exact same PT locks as core-mm page table * walkers would. When modifying a PTE table, hugetlb must take the * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD * PT lock etc. * * The expectation is that any hugetlb folio smaller than a PMD is * always mapped into a single PTE table and that any hugetlb folio * smaller than a PUD (but at least as big as a PMD) is always mapped * into a single PMD table. * * If that does not hold for an architecture, then that architecture * must disable split PT locks such that all *_lockptr() functions * will give us the same result: the per-MM PT lock. * * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() * and core-mm would use pmd_lockptr(). However, in such configurations * split PMD locks are disabled -- they don't make sense on a single * PGDIR page table -- and the end result is the same. */ if (size >= PUD_SIZE) return pud_lockptr(mm, (pud_t *) pte); else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ return ptep_lockptr(mm, pte); } #ifndef hugepages_supported /* * Some platform decide whether they support huge pages at boot * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0 * when there is no such support */ #define hugepages_supported() (HPAGE_SHIFT != 0) #endif void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); static inline void hugetlb_count_init(struct mm_struct *mm) { atomic_long_set(&mm->hugetlb_usage, 0); } static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); } #endif #ifndef huge_ptep_modify_prot_commit #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { unsigned long psize = huge_page_size(hstate_vma(vma)); set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif #ifdef CONFIG_NUMA void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif /* * Check if a given raw @page in a hugepage is HWPOISON. */ bool is_raw_hwpoison_page_in_hugepage(struct page *page); #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return NULL; } static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { return -ENOMEM; } static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { return NULL; } static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { return NULL; } static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; } static inline struct hstate *hstate_file(struct file *f) { return NULL; } static inline struct hstate *hstate_sizelog(int page_size_log) { return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return NULL; } static inline struct hstate *folio_hstate(struct folio *folio) { return NULL; } static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; } static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; } static inline unsigned long huge_page_mask(struct hstate *h) { return PAGE_MASK; } static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned int huge_page_order(struct hstate *h) { return 0; } static inline unsigned int huge_page_shift(struct hstate *h) { return PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return false; } static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } static inline unsigned hstate_index_to_shift(unsigned index) { return 0; } static inline int hstate_index(struct hstate *h) { return 0; } static inline int dissolve_free_hugetlb_folio(struct folio *folio) { return 0; } static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline bool hugepage_migration_supported(struct hstate *h) { return false; } static inline bool hugepage_movable_supported(struct hstate *h) { return false; } static inline gfp_t htlb_alloc_mask(struct hstate *h) { return 0; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { return 0; } static inline bool htlb_allow_alloc_fallback(int reason) { return false; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void hugetlb_count_init(struct mm_struct *mm) { } static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_MMU return ptep_get(ptep); #else return *ptep; #endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { } static inline void hugetlb_register_node(struct node *node) { } static inline void hugetlb_unregister_node(struct node *node) { } static inline bool hugetlbfs_pagecache_present( struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return false; } #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { spinlock_t *ptl; ptl = huge_pte_lockptr(h, mm, pte); spin_lock(ptl); return ptl; } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); #else static inline __init void hugetlb_cma_reserve(int order) { } #endif #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE static inline bool hugetlb_pmd_shared(pte_t *pte) { return page_count(virt_to_page(pte)) > 1; } #else static inline bool hugetlb_pmd_shared(pte_t *pte) { return false; } #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE /* * ARCHes with special requirements for evicting HUGETLB backing TLB entries can * implement this. */ #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif static inline bool __vma_shareable_lock(struct vm_area_struct *vma) { return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). */ static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { #if defined(CONFIG_HUGETLB_PAGE) && \ defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* * If pmd sharing possible, locking needed to safely walk the * hugetlb pgtables. More information can be found at the comment * above huge_pte_offset() in the same file. * * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. */ if (__vma_shareable_lock(vma)) WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && !lockdep_is_held( &vma->vm_file->f_mapping->i_mmap_rwsem)); #endif return huge_pte_offset(vma->vm_mm, addr, sz); } #endif /* _LINUX_HUGETLB_H */
33 28 28 5 32 32 16 32 5 32 33 271 268 260 260 149 272 152 151 131 131 132 61 152 133 152 108 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/types.h> #include "ntfs_fs.h" #define BITS_IN_SIZE_T (sizeof(size_t) * 8) /* * fill_mask[i] - first i bits are '1' , i = 0,1,2,3,4,5,6,7,8 * fill_mask[i] = 0xFF >> (8-i) */ static const u8 fill_mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF }; /* * zero_mask[i] - first i bits are '0' , i = 0,1,2,3,4,5,6,7,8 * zero_mask[i] = 0xFF << i */ static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80, 0x00 }; /* * are_bits_clear * * Return: True if all bits [bit, bit+nbits) are zeros "0". */ bool are_bits_clear(const void *lmap, size_t bit, size_t nbits) { size_t pos = bit & 7; const u8 *map = (u8 *)lmap + (bit >> 3); if (pos) { if (8 - pos >= nbits) return !nbits || !(*map & fill_mask[pos + nbits] & zero_mask[pos]); if (*map++ & zero_mask[pos]) return false; nbits -= 8 - pos; } pos = ((size_t)map) & (sizeof(size_t) - 1); if (pos) { pos = sizeof(size_t) - pos; if (nbits >= pos * 8) { for (nbits -= pos * 8; pos; pos--, map++) { if (*map) return false; } } } for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { if (*((size_t *)map)) return false; } for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { if (*map) return false; } pos = nbits & 7; if (pos && (*map & fill_mask[pos])) return false; return true; } /* * are_bits_set * * Return: True if all bits [bit, bit+nbits) are ones "1". */ bool are_bits_set(const void *lmap, size_t bit, size_t nbits) { u8 mask; size_t pos = bit & 7; const u8 *map = (u8 *)lmap + (bit >> 3); if (pos) { if (8 - pos >= nbits) { mask = fill_mask[pos + nbits] & zero_mask[pos]; return !nbits || (*map & mask) == mask; } mask = zero_mask[pos]; if ((*map++ & mask) != mask) return false; nbits -= 8 - pos; } pos = ((size_t)map) & (sizeof(size_t) - 1); if (pos) { pos = sizeof(size_t) - pos; if (nbits >= pos * 8) { for (nbits -= pos * 8; pos; pos--, map++) { if (*map != 0xFF) return false; } } } for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { if (*((size_t *)map) != MINUS_ONE_T) return false; } for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { if (*map != 0xFF) return false; } pos = nbits & 7; if (pos) { mask = fill_mask[pos]; if ((*map & mask) != mask) return false; } return true; }
64 96 126 97 2 1 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include <linux/sock_diag.h> #include <net/rstreason.h> /* * tcp event with arguments sk and skb * * Note: this class requires a valid sk pointer; while skb pointer could * be NULL. */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", __entry->skbaddr, __entry->skaddr, show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) ); DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); #undef FN #define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason); DEFINE_RST_REASON(FN, FN) #undef FN #undef FNe #define FN(reason) { SK_RST_REASON_##reason, #reason }, #define FNe(reason) { SK_RST_REASON_##reason, #reason } /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ TRACE_EVENT(tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb__nullable, const enum sk_rst_reason reason), TP_ARGS(sk, skb__nullable, reason), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(enum sk_rst_reason, reason) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->skbaddr = skb__nullable; __entry->skaddr = sk; /* Zero means unknown state. */ __entry->state = sk ? sk->sk_state : 0; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); if (sk && sk_fullsock(sk)) { const struct inet_sock *inet = inet_sk(sk); TP_STORE_ADDR_PORTS(__entry, inet, sk); } else if (skb__nullable) { const struct tcphdr *th = (const struct tcphdr *)skb__nullable->data; /* * We should reverse the 4-tuple of skb, so later * it can print the right flow direction of rst. */ TP_STORE_ADDR_PORTS_SKB(skb__nullable, th, entry->daddr, entry->saddr); } __entry->reason = reason; ), TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s reason=%s", __entry->skbaddr, __entry->skaddr, __entry->saddr, __entry->daddr, __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN", __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe))) ); #undef FN #undef FNe /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. */ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) __field(const void *, skbaddr) __field(const void *, skaddr) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); __entry->skbaddr = skb; __entry->skaddr = sk; ), TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx skbaddr=%p skaddr=%p", show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie, __entry->skbaddr, __entry->skaddr) ); /* * tcp event with only skb */ DECLARE_EVENT_CLASS(tcp_event_skb, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field(const void *, skbaddr) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); ), TP_printk("skbaddr=%p src=%pISpc dest=%pISpc", __entry->skbaddr, __entry->saddr, __entry->daddr) ); DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); TRACE_EVENT(tcp_cong_state_set, TP_PROTO(struct sock *sk, const u8 ca_state), TP_ARGS(sk, ca_state), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u8, cong_state) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->cong_state = ca_state; ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->cong_state) ); DECLARE_EVENT_CLASS(tcp_hash_event, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(int, l3index) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(bool, fin) __field(bool, syn) __field(bool, rst) __field(bool, psh) __field(bool, ack) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0; /* For filtering use */ __entry->sport = ntohs(th->source); __entry->dport = ntohs(th->dest); __entry->family = sk->sk_family; __entry->fin = th->fin; __entry->syn = th->syn; __entry->rst = th->rst; __entry->psh = th->psh; __entry->ack = th->ack; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c]", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->l3index, __entry->fin ? 'F' : ' ', __entry->syn ? 'S' : ' ', __entry->rst ? 'R' : ' ', __entry->psh ? 'P' : ' ', __entry->ack ? '.' : ' ') ); DEFINE_EVENT(tcp_hash_event, tcp_hash_bad_header, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_required, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_unexpected, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_mismatch, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_ao_required, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DECLARE_EVENT_CLASS(tcp_ao_event, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(int, l3index) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(bool, fin) __field(bool, syn) __field(bool, rst) __field(bool, psh) __field(bool, ack) __field(__u8, keyid) __field(__u8, rnext) __field(__u8, maclen) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0; /* For filtering use */ __entry->sport = ntohs(th->source); __entry->dport = ntohs(th->dest); __entry->family = sk->sk_family; __entry->fin = th->fin; __entry->syn = th->syn; __entry->rst = th->rst; __entry->psh = th->psh; __entry->ack = th->ack; __entry->keyid = keyid; __entry->rnext = rnext; __entry->maclen = maclen; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->l3index, __entry->fin ? 'F' : ' ', __entry->syn ? 'S' : ' ', __entry->rst ? 'R' : ' ', __entry->psh ? 'P' : ' ', __entry->ack ? '.' : ' ', __entry->keyid, __entry->rnext, __entry->maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_handshake_failure, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_wrong_maclen, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_mismatch, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_key_not_found, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_rnext_request, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DECLARE_EVENT_CLASS(tcp_ao_event_sk, TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext), TP_ARGS(sk, keyid, rnext), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u8, keyid) __field(__u8, rnext) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; __entry->keyid = keyid; __entry->rnext = rnext; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc keyid=%u rnext=%u", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->keyid, __entry->rnext) ); DEFINE_EVENT(tcp_ao_event_sk, tcp_ao_synack_no_key, TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext), TP_ARGS(sk, keyid, rnext) ); DECLARE_EVENT_CLASS(tcp_ao_event_sne, TP_PROTO(const struct sock *sk, __u32 new_sne), TP_ARGS(sk, new_sne), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, new_sne) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; __entry->new_sne = new_sne; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc sne=%u", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->new_sne) ); DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_snd_sne_update, TP_PROTO(const struct sock *sk, __u32 new_sne), TP_ARGS(sk, new_sne) ); DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_rcv_sne_update, TP_PROTO(const struct sock *sk, __u32 new_sne), TP_ARGS(sk, new_sne) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
10 183 181 183 12 2 2 11 313 226 219 20 312 42 15 22 7 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/quota.h> #include <linux/export.h> /** * qid_eq - Test to see if to kquid values are the same * @left: A qid value * @right: Another quid value * * Return true if the two qid values are equal and false otherwise. */ bool qid_eq(struct kqid left, struct kqid right) { if (left.type != right.type) return false; switch(left.type) { case USRQUOTA: return uid_eq(left.uid, right.uid); case GRPQUOTA: return gid_eq(left.gid, right.gid); case PRJQUOTA: return projid_eq(left.projid, right.projid); default: BUG(); } } EXPORT_SYMBOL(qid_eq); /** * qid_lt - Test to see if one qid value is less than another * @left: The possibly lesser qid value * @right: The possibly greater qid value * * Return true if left is less than right and false otherwise. */ bool qid_lt(struct kqid left, struct kqid right) { if (left.type < right.type) return true; if (left.type > right.type) return false; switch (left.type) { case USRQUOTA: return uid_lt(left.uid, right.uid); case GRPQUOTA: return gid_lt(left.gid, right.gid); case PRJQUOTA: return projid_lt(left.projid, right.projid); default: BUG(); } } EXPORT_SYMBOL(qid_lt); /** * from_kqid - Create a qid from a kqid user-namespace pair. * @targ: The user namespace we want a qid in. * @kqid: The kernel internal quota identifier to start with. * * Map @kqid into the user-namespace specified by @targ and * return the resulting qid. * * There is always a mapping into the initial user_namespace. * * If @kqid has no mapping in @targ (qid_t)-1 is returned. */ qid_t from_kqid(struct user_namespace *targ, struct kqid kqid) { switch (kqid.type) { case USRQUOTA: return from_kuid(targ, kqid.uid); case GRPQUOTA: return from_kgid(targ, kqid.gid); case PRJQUOTA: return from_kprojid(targ, kqid.projid); default: BUG(); } } EXPORT_SYMBOL(from_kqid); /** * from_kqid_munged - Create a qid from a kqid user-namespace pair. * @targ: The user namespace we want a qid in. * @kqid: The kernel internal quota identifier to start with. * * Map @kqid into the user-namespace specified by @targ and * return the resulting qid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kqid from_kqid_munged never fails and always * returns a valid projid. This makes from_kqid_munged * appropriate for use in places where failing to provide * a qid_t is not a good option. * * If @kqid has no mapping in @targ the kqid.type specific * overflow identifier is returned. */ qid_t from_kqid_munged(struct user_namespace *targ, struct kqid kqid) { switch (kqid.type) { case USRQUOTA: return from_kuid_munged(targ, kqid.uid); case GRPQUOTA: return from_kgid_munged(targ, kqid.gid); case PRJQUOTA: return from_kprojid_munged(targ, kqid.projid); default: BUG(); } } EXPORT_SYMBOL(from_kqid_munged); /** * qid_valid - Report if a valid value is stored in a kqid. * @qid: The kernel internal quota identifier to test. */ bool qid_valid(struct kqid qid) { switch (qid.type) { case USRQUOTA: return uid_valid(qid.uid); case GRPQUOTA: return gid_valid(qid.gid); case PRJQUOTA: return projid_valid(qid.projid); default: BUG(); } } EXPORT_SYMBOL(qid_valid);
53 52 11 11 53 11 53 50 9 49 53 2 1 1 2 18 17 2 1 17 54 53 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/ipc/msgutil.c * Copyright (C) 1999, 2004 Manfred Spraul */ #include <linux/spinlock.h> #include <linux/init.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/utsname.h> #include <linux/proc_ns.h> #include <linux/uaccess.h> #include <linux/sched.h> #include "util.h" DEFINE_SPINLOCK(mq_lock); /* * The next 2 defines are here bc this is the only file * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { .ns.count = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif }; struct msg_msgseg { struct msg_msgseg *next; /* the next part of the message follows immediately */ }; #define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg)) static kmem_buckets *msg_buckets __ro_after_init; static int __init init_msg_buckets(void) { msg_buckets = kmem_buckets_create("msg_msg", SLAB_ACCOUNT, sizeof(struct msg_msg), DATALEN_MSG, NULL); return 0; } subsys_initcall(init_msg_buckets); static struct msg_msg *alloc_msg(size_t len) { struct msg_msg *msg; struct msg_msgseg **pseg; size_t alen; alen = min(len, DATALEN_MSG); msg = kmem_buckets_alloc(msg_buckets, sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) return NULL; msg->next = NULL; msg->security = NULL; len -= alen; pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; cond_resched(); alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) goto out_err; *pseg = seg; seg->next = NULL; pseg = &seg->next; len -= alen; } return msg; out_err: free_msg(msg); return NULL; } struct msg_msg *load_msg(const void __user *src, size_t len) { struct msg_msg *msg; struct msg_msgseg *seg; int err = -EFAULT; size_t alen; msg = alloc_msg(len); if (msg == NULL) return ERR_PTR(-ENOMEM); alen = min(len, DATALEN_MSG); if (copy_from_user(msg + 1, src, alen)) goto out_err; for (seg = msg->next; seg != NULL; seg = seg->next) { len -= alen; src = (char __user *)src + alen; alen = min(len, DATALEN_SEG); if (copy_from_user(seg + 1, src, alen)) goto out_err; } err = security_msg_msg_alloc(msg); if (err) goto out_err; return msg; out_err: free_msg(msg); return ERR_PTR(err); } #ifdef CONFIG_CHECKPOINT_RESTORE struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) { struct msg_msgseg *dst_pseg, *src_pseg; size_t len = src->m_ts; size_t alen; if (src->m_ts > dst->m_ts) return ERR_PTR(-EINVAL); alen = min(len, DATALEN_MSG); memcpy(dst + 1, src + 1, alen); for (dst_pseg = dst->next, src_pseg = src->next; src_pseg != NULL; dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) { len -= alen; alen = min(len, DATALEN_SEG); memcpy(dst_pseg + 1, src_pseg + 1, alen); } dst->m_type = src->m_type; dst->m_ts = src->m_ts; return dst; } #else struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) { return ERR_PTR(-ENOSYS); } #endif int store_msg(void __user *dest, struct msg_msg *msg, size_t len) { size_t alen; struct msg_msgseg *seg; alen = min(len, DATALEN_MSG); if (copy_to_user(dest, msg + 1, alen)) return -1; for (seg = msg->next; seg != NULL; seg = seg->next) { len -= alen; dest = (char __user *)dest + alen; alen = min(len, DATALEN_SEG); if (copy_to_user(dest, seg + 1, alen)) return -1; } return 0; } void free_msg(struct msg_msg *msg) { struct msg_msgseg *seg; security_msg_msg_free(msg); seg = msg->next; kfree(msg); while (seg != NULL) { struct msg_msgseg *tmp = seg->next; cond_resched(); kfree(seg); seg = tmp; } }
9 9 9 9 9 9 9 9 9 9 9 9 9 8 9 9 9 9 9 9 9 10 10 10 10 10 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 // SPDX-License-Identifier: GPL-2.0-only /* * Sysfs interface for the universal power supply monitor class * * Copyright © 2007 David Woodhouse <dwmw2@infradead.org> * Copyright © 2007 Anton Vorontsov <cbou@mail.ru> * Copyright © 2004 Szabolcs Gyurko * Copyright © 2003 Ian Molton <spyro@f2s.com> * * Modified: 2004, Oct Szabolcs Gyurko */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/power_supply.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/string_helpers.h> #include "power_supply.h" #define MAX_PROP_NAME_LEN 30 struct power_supply_attr { const char *prop_name; char attr_name[MAX_PROP_NAME_LEN + 1]; struct device_attribute dev_attr; const char * const *text_values; int text_values_len; }; #define _POWER_SUPPLY_ATTR(_name, _text, _len) \ [POWER_SUPPLY_PROP_ ## _name] = \ { \ .prop_name = #_name, \ .attr_name = #_name "\0", \ .text_values = _text, \ .text_values_len = _len, \ } #define POWER_SUPPLY_ATTR(_name) _POWER_SUPPLY_ATTR(_name, NULL, 0) #define _POWER_SUPPLY_ENUM_ATTR(_name, _text) \ _POWER_SUPPLY_ATTR(_name, _text, ARRAY_SIZE(_text)) #define POWER_SUPPLY_ENUM_ATTR(_name) \ _POWER_SUPPLY_ENUM_ATTR(_name, POWER_SUPPLY_ ## _name ## _TEXT) static const char * const POWER_SUPPLY_TYPE_TEXT[] = { [POWER_SUPPLY_TYPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_TYPE_BATTERY] = "Battery", [POWER_SUPPLY_TYPE_UPS] = "UPS", [POWER_SUPPLY_TYPE_MAINS] = "Mains", [POWER_SUPPLY_TYPE_USB] = "USB", [POWER_SUPPLY_TYPE_USB_DCP] = "USB_DCP", [POWER_SUPPLY_TYPE_USB_CDP] = "USB_CDP", [POWER_SUPPLY_TYPE_USB_ACA] = "USB_ACA", [POWER_SUPPLY_TYPE_USB_TYPE_C] = "USB_C", [POWER_SUPPLY_TYPE_USB_PD] = "USB_PD", [POWER_SUPPLY_TYPE_USB_PD_DRP] = "USB_PD_DRP", [POWER_SUPPLY_TYPE_APPLE_BRICK_ID] = "BrickID", [POWER_SUPPLY_TYPE_WIRELESS] = "Wireless", }; static const char * const POWER_SUPPLY_USB_TYPE_TEXT[] = { [POWER_SUPPLY_USB_TYPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_USB_TYPE_SDP] = "SDP", [POWER_SUPPLY_USB_TYPE_DCP] = "DCP", [POWER_SUPPLY_USB_TYPE_CDP] = "CDP", [POWER_SUPPLY_USB_TYPE_ACA] = "ACA", [POWER_SUPPLY_USB_TYPE_C] = "C", [POWER_SUPPLY_USB_TYPE_PD] = "PD", [POWER_SUPPLY_USB_TYPE_PD_DRP] = "PD_DRP", [POWER_SUPPLY_USB_TYPE_PD_PPS] = "PD_PPS", [POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID] = "BrickID", }; static const char * const POWER_SUPPLY_STATUS_TEXT[] = { [POWER_SUPPLY_STATUS_UNKNOWN] = "Unknown", [POWER_SUPPLY_STATUS_CHARGING] = "Charging", [POWER_SUPPLY_STATUS_DISCHARGING] = "Discharging", [POWER_SUPPLY_STATUS_NOT_CHARGING] = "Not charging", [POWER_SUPPLY_STATUS_FULL] = "Full", }; static const char * const POWER_SUPPLY_CHARGE_TYPE_TEXT[] = { [POWER_SUPPLY_CHARGE_TYPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_CHARGE_TYPE_NONE] = "N/A", [POWER_SUPPLY_CHARGE_TYPE_TRICKLE] = "Trickle", [POWER_SUPPLY_CHARGE_TYPE_FAST] = "Fast", [POWER_SUPPLY_CHARGE_TYPE_STANDARD] = "Standard", [POWER_SUPPLY_CHARGE_TYPE_ADAPTIVE] = "Adaptive", [POWER_SUPPLY_CHARGE_TYPE_CUSTOM] = "Custom", [POWER_SUPPLY_CHARGE_TYPE_LONGLIFE] = "Long Life", [POWER_SUPPLY_CHARGE_TYPE_BYPASS] = "Bypass", }; static const char * const POWER_SUPPLY_HEALTH_TEXT[] = { [POWER_SUPPLY_HEALTH_UNKNOWN] = "Unknown", [POWER_SUPPLY_HEALTH_GOOD] = "Good", [POWER_SUPPLY_HEALTH_OVERHEAT] = "Overheat", [POWER_SUPPLY_HEALTH_DEAD] = "Dead", [POWER_SUPPLY_HEALTH_OVERVOLTAGE] = "Over voltage", [POWER_SUPPLY_HEALTH_UNSPEC_FAILURE] = "Unspecified failure", [POWER_SUPPLY_HEALTH_COLD] = "Cold", [POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE] = "Watchdog timer expire", [POWER_SUPPLY_HEALTH_SAFETY_TIMER_EXPIRE] = "Safety timer expire", [POWER_SUPPLY_HEALTH_OVERCURRENT] = "Over current", [POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED] = "Calibration required", [POWER_SUPPLY_HEALTH_WARM] = "Warm", [POWER_SUPPLY_HEALTH_COOL] = "Cool", [POWER_SUPPLY_HEALTH_HOT] = "Hot", [POWER_SUPPLY_HEALTH_NO_BATTERY] = "No battery", }; static const char * const POWER_SUPPLY_TECHNOLOGY_TEXT[] = { [POWER_SUPPLY_TECHNOLOGY_UNKNOWN] = "Unknown", [POWER_SUPPLY_TECHNOLOGY_NiMH] = "NiMH", [POWER_SUPPLY_TECHNOLOGY_LION] = "Li-ion", [POWER_SUPPLY_TECHNOLOGY_LIPO] = "Li-poly", [POWER_SUPPLY_TECHNOLOGY_LiFe] = "LiFe", [POWER_SUPPLY_TECHNOLOGY_NiCd] = "NiCd", [POWER_SUPPLY_TECHNOLOGY_LiMn] = "LiMn", }; static const char * const POWER_SUPPLY_CAPACITY_LEVEL_TEXT[] = { [POWER_SUPPLY_CAPACITY_LEVEL_UNKNOWN] = "Unknown", [POWER_SUPPLY_CAPACITY_LEVEL_CRITICAL] = "Critical", [POWER_SUPPLY_CAPACITY_LEVEL_LOW] = "Low", [POWER_SUPPLY_CAPACITY_LEVEL_NORMAL] = "Normal", [POWER_SUPPLY_CAPACITY_LEVEL_HIGH] = "High", [POWER_SUPPLY_CAPACITY_LEVEL_FULL] = "Full", }; static const char * const POWER_SUPPLY_SCOPE_TEXT[] = { [POWER_SUPPLY_SCOPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_SCOPE_SYSTEM] = "System", [POWER_SUPPLY_SCOPE_DEVICE] = "Device", }; static const char * const POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT[] = { [POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO] = "auto", [POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE] = "inhibit-charge", [POWER_SUPPLY_CHARGE_BEHAVIOUR_FORCE_DISCHARGE] = "force-discharge", }; static struct power_supply_attr power_supply_attrs[] = { /* Properties of type `int' */ POWER_SUPPLY_ENUM_ATTR(STATUS), POWER_SUPPLY_ENUM_ATTR(CHARGE_TYPE), POWER_SUPPLY_ENUM_ATTR(HEALTH), POWER_SUPPLY_ATTR(PRESENT), POWER_SUPPLY_ATTR(ONLINE), POWER_SUPPLY_ATTR(AUTHENTIC), POWER_SUPPLY_ENUM_ATTR(TECHNOLOGY), POWER_SUPPLY_ATTR(CYCLE_COUNT), POWER_SUPPLY_ATTR(VOLTAGE_MAX), POWER_SUPPLY_ATTR(VOLTAGE_MIN), POWER_SUPPLY_ATTR(VOLTAGE_MAX_DESIGN), POWER_SUPPLY_ATTR(VOLTAGE_MIN_DESIGN), POWER_SUPPLY_ATTR(VOLTAGE_NOW), POWER_SUPPLY_ATTR(VOLTAGE_AVG), POWER_SUPPLY_ATTR(VOLTAGE_OCV), POWER_SUPPLY_ATTR(VOLTAGE_BOOT), POWER_SUPPLY_ATTR(CURRENT_MAX), POWER_SUPPLY_ATTR(CURRENT_NOW), POWER_SUPPLY_ATTR(CURRENT_AVG), POWER_SUPPLY_ATTR(CURRENT_BOOT), POWER_SUPPLY_ATTR(POWER_NOW), POWER_SUPPLY_ATTR(POWER_AVG), POWER_SUPPLY_ATTR(CHARGE_FULL_DESIGN), POWER_SUPPLY_ATTR(CHARGE_EMPTY_DESIGN), POWER_SUPPLY_ATTR(CHARGE_FULL), POWER_SUPPLY_ATTR(CHARGE_EMPTY), POWER_SUPPLY_ATTR(CHARGE_NOW), POWER_SUPPLY_ATTR(CHARGE_AVG), POWER_SUPPLY_ATTR(CHARGE_COUNTER), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_CURRENT), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_CURRENT_MAX), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_VOLTAGE), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_VOLTAGE_MAX), POWER_SUPPLY_ATTR(CHARGE_CONTROL_LIMIT), POWER_SUPPLY_ATTR(CHARGE_CONTROL_LIMIT_MAX), POWER_SUPPLY_ATTR(CHARGE_CONTROL_START_THRESHOLD), POWER_SUPPLY_ATTR(CHARGE_CONTROL_END_THRESHOLD), POWER_SUPPLY_ENUM_ATTR(CHARGE_BEHAVIOUR), POWER_SUPPLY_ATTR(INPUT_CURRENT_LIMIT), POWER_SUPPLY_ATTR(INPUT_VOLTAGE_LIMIT), POWER_SUPPLY_ATTR(INPUT_POWER_LIMIT), POWER_SUPPLY_ATTR(ENERGY_FULL_DESIGN), POWER_SUPPLY_ATTR(ENERGY_EMPTY_DESIGN), POWER_SUPPLY_ATTR(ENERGY_FULL), POWER_SUPPLY_ATTR(ENERGY_EMPTY), POWER_SUPPLY_ATTR(ENERGY_NOW), POWER_SUPPLY_ATTR(ENERGY_AVG), POWER_SUPPLY_ATTR(CAPACITY), POWER_SUPPLY_ATTR(CAPACITY_ALERT_MIN), POWER_SUPPLY_ATTR(CAPACITY_ALERT_MAX), POWER_SUPPLY_ATTR(CAPACITY_ERROR_MARGIN), POWER_SUPPLY_ENUM_ATTR(CAPACITY_LEVEL), POWER_SUPPLY_ATTR(TEMP), POWER_SUPPLY_ATTR(TEMP_MAX), POWER_SUPPLY_ATTR(TEMP_MIN), POWER_SUPPLY_ATTR(TEMP_ALERT_MIN), POWER_SUPPLY_ATTR(TEMP_ALERT_MAX), POWER_SUPPLY_ATTR(TEMP_AMBIENT), POWER_SUPPLY_ATTR(TEMP_AMBIENT_ALERT_MIN), POWER_SUPPLY_ATTR(TEMP_AMBIENT_ALERT_MAX), POWER_SUPPLY_ATTR(TIME_TO_EMPTY_NOW), POWER_SUPPLY_ATTR(TIME_TO_EMPTY_AVG), POWER_SUPPLY_ATTR(TIME_TO_FULL_NOW), POWER_SUPPLY_ATTR(TIME_TO_FULL_AVG), POWER_SUPPLY_ENUM_ATTR(TYPE), POWER_SUPPLY_ENUM_ATTR(USB_TYPE), POWER_SUPPLY_ENUM_ATTR(SCOPE), POWER_SUPPLY_ATTR(PRECHARGE_CURRENT), POWER_SUPPLY_ATTR(CHARGE_TERM_CURRENT), POWER_SUPPLY_ATTR(CALIBRATE), POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), POWER_SUPPLY_ATTR(SERIAL_NUMBER), }; #define POWER_SUPPLY_ATTR_CNT ARRAY_SIZE(power_supply_attrs) static struct attribute * __power_supply_attrs[POWER_SUPPLY_ATTR_CNT + 1]; static struct power_supply_attr *to_ps_attr(struct device_attribute *attr) { return container_of(attr, struct power_supply_attr, dev_attr); } static enum power_supply_property dev_attr_psp(struct device_attribute *attr) { return to_ps_attr(attr) - power_supply_attrs; } static ssize_t power_supply_show_enum_with_available( struct device *dev, const char * const labels[], int label_count, unsigned int available_values, int value, char *buf) { bool match = false, available, active; ssize_t count = 0; int i; for (i = 0; i < label_count; i++) { available = available_values & BIT(i); active = i == value; if (available && active) { count += sysfs_emit_at(buf, count, "[%s] ", labels[i]); match = true; } else if (available) { count += sysfs_emit_at(buf, count, "%s ", labels[i]); } } if (!match) { dev_warn(dev, "driver reporting unavailable enum value %d\n", value); return -EINVAL; } if (count) buf[count - 1] = '\n'; return count; } static ssize_t power_supply_show_property(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct power_supply *psy = dev_get_drvdata(dev); struct power_supply_attr *ps_attr = to_ps_attr(attr); enum power_supply_property psp = dev_attr_psp(attr); union power_supply_propval value; if (psp == POWER_SUPPLY_PROP_TYPE) { value.intval = psy->desc->type; } else { ret = power_supply_get_property(psy, psp, &value); if (ret < 0) { if (ret == -ENODATA) dev_dbg_ratelimited(dev, "driver has no data for `%s' property\n", attr->attr.name); else if (ret != -ENODEV && ret != -EAGAIN) dev_err_ratelimited(dev, "driver failed to report `%s' property: %zd\n", attr->attr.name, ret); return ret; } } switch (psp) { case POWER_SUPPLY_PROP_USB_TYPE: ret = power_supply_show_enum_with_available( dev, POWER_SUPPLY_USB_TYPE_TEXT, ARRAY_SIZE(POWER_SUPPLY_USB_TYPE_TEXT), psy->desc->usb_types, value.intval, buf); break; case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR: ret = power_supply_charge_behaviour_show(dev, psy->desc->charge_behaviours, value.intval, buf); break; case POWER_SUPPLY_PROP_MODEL_NAME ... POWER_SUPPLY_PROP_SERIAL_NUMBER: ret = sysfs_emit(buf, "%s\n", value.strval); break; default: if (ps_attr->text_values_len > 0 && value.intval < ps_attr->text_values_len && value.intval >= 0) { ret = sysfs_emit(buf, "%s\n", ps_attr->text_values[value.intval]); } else { ret = sysfs_emit(buf, "%d\n", value.intval); } } return ret; } static ssize_t power_supply_store_property(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { ssize_t ret; struct power_supply *psy = dev_get_drvdata(dev); struct power_supply_attr *ps_attr = to_ps_attr(attr); enum power_supply_property psp = dev_attr_psp(attr); union power_supply_propval value; ret = -EINVAL; if (ps_attr->text_values_len > 0) { ret = __sysfs_match_string(ps_attr->text_values, ps_attr->text_values_len, buf); } /* * If no match was found, then check to see if it is an integer. * Integer values are valid for enums in addition to the text value. */ if (ret < 0) { long long_val; ret = kstrtol(buf, 10, &long_val); if (ret < 0) return ret; ret = long_val; } value.intval = ret; ret = power_supply_set_property(psy, psp, &value); if (ret < 0) return ret; return count; } static umode_t power_supply_attr_is_visible(struct kobject *kobj, struct attribute *attr, int attrno) { struct device *dev = kobj_to_dev(kobj); struct power_supply *psy = dev_get_drvdata(dev); umode_t mode = S_IRUSR | S_IRGRP | S_IROTH; int i; if (!power_supply_attrs[attrno].prop_name) return 0; if (attrno == POWER_SUPPLY_PROP_TYPE) return mode; for (i = 0; i < psy->desc->num_properties; i++) { int property = psy->desc->properties[i]; if (property == attrno) { if (power_supply_property_is_writeable(psy, property) > 0) mode |= S_IWUSR; return mode; } } if (power_supply_battery_info_has_prop(psy->battery_info, attrno)) return mode; return 0; } static const struct attribute_group power_supply_attr_group = { .attrs = __power_supply_attrs, .is_visible = power_supply_attr_is_visible, }; const struct attribute_group *power_supply_attr_groups[] = { &power_supply_attr_group, NULL }; void power_supply_init_attrs(void) { int i; for (i = 0; i < ARRAY_SIZE(power_supply_attrs); i++) { struct device_attribute *attr; if (!power_supply_attrs[i].prop_name) { pr_warn("%s: Property %d skipped because it is missing from power_supply_attrs\n", __func__, i); sprintf(power_supply_attrs[i].attr_name, "_err_%d", i); } else { string_lower(power_supply_attrs[i].attr_name, power_supply_attrs[i].attr_name); } attr = &power_supply_attrs[i].dev_attr; attr->attr.name = power_supply_attrs[i].attr_name; attr->show = power_supply_show_property; attr->store = power_supply_store_property; __power_supply_attrs[i] = &attr->attr; } } static int add_prop_uevent(const struct device *dev, struct kobj_uevent_env *env, enum power_supply_property prop, char *prop_buf) { int ret = 0; struct power_supply_attr *pwr_attr; struct device_attribute *dev_attr; char *line; pwr_attr = &power_supply_attrs[prop]; dev_attr = &pwr_attr->dev_attr; ret = power_supply_show_property((struct device *)dev, dev_attr, prop_buf); if (ret == -ENODEV || ret == -ENODATA) { /* * When a battery is absent, we expect -ENODEV. Don't abort; * send the uevent with at least the PRESENT=0 property */ return 0; } if (ret < 0) return ret; line = strchr(prop_buf, '\n'); if (line) *line = 0; return add_uevent_var(env, "POWER_SUPPLY_%s=%s", pwr_attr->prop_name, prop_buf); } int power_supply_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct power_supply *psy = dev_get_drvdata(dev); const enum power_supply_property *battery_props = power_supply_battery_info_properties; unsigned long psy_drv_properties[POWER_SUPPLY_ATTR_CNT / sizeof(unsigned long) + 1] = {0}; int ret = 0, j; char *prop_buf; if (!psy || !psy->desc) { dev_dbg(dev, "No power supply yet\n"); return ret; } ret = add_uevent_var(env, "POWER_SUPPLY_NAME=%s", psy->desc->name); if (ret) return ret; /* * Kernel generates KOBJ_REMOVE uevent in device removal path, after * resources have been freed. Exit early to avoid use-after-free. */ if (psy->removing) return 0; prop_buf = (char *)get_zeroed_page(GFP_KERNEL); if (!prop_buf) return -ENOMEM; ret = add_prop_uevent(dev, env, POWER_SUPPLY_PROP_TYPE, prop_buf); if (ret) goto out; for (j = 0; j < psy->desc->num_properties; j++) { set_bit(psy->desc->properties[j], psy_drv_properties); ret = add_prop_uevent(dev, env, psy->desc->properties[j], prop_buf); if (ret) goto out; } for (j = 0; j < power_supply_battery_info_properties_size; j++) { if (test_bit(battery_props[j], psy_drv_properties)) continue; if (!power_supply_battery_info_has_prop(psy->battery_info, battery_props[j])) continue; ret = add_prop_uevent(dev, env, battery_props[j], prop_buf); if (ret) goto out; } out: free_page((unsigned long)prop_buf); return ret; } ssize_t power_supply_charge_behaviour_show(struct device *dev, unsigned int available_behaviours, enum power_supply_charge_behaviour current_behaviour, char *buf) { return power_supply_show_enum_with_available( dev, POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT, ARRAY_SIZE(POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT), available_behaviours, current_behaviour, buf); } EXPORT_SYMBOL_GPL(power_supply_charge_behaviour_show); int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf) { int i = sysfs_match_string(POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT, buf); if (i < 0) return i; if (available_behaviours & BIT(i)) return i; return -EINVAL; } EXPORT_SYMBOL_GPL(power_supply_charge_behaviour_parse);
7 5 7 4 4 4 65 9 9 9 65 4 61 65 7 5 7 2 5 5 1 5 59 59 1 1 59 57 59 6 4 4 4 5 4 4 4 3 6 1 5 1 24 22 2 23 21 58 56 5 55 7 6 1 187 186 187 44 187 187 186 186 186 185 186 185 33 185 109 109 101 36 3 33 33 33 156 87 112 110 93 33 32 1 1 33 3 184 187 187 93 93 91 63 63 91 28 42 67 67 67 23 23 66 92 42 42 42 42 39 39 8 31 31 31 30 10 6 6 6 6 6 6 6 3 6 6 6 6 6 6 3 6 6 6 4 4 4 4 4 4 4 4 1 4 6 6 6 4 33 42 6 6 6 3 6 6 6 6 6 6 6 6 4 4 4 2 2 2 2 2 4 4 4 3 4 4 3 1 4 4 4 4 4 4 4 4 3 3 3 3 3 3 4 1 3 1 6 6 32 20 32 32 32 32 32 32 32 32 32 3 29 29 29 32 31 21 21 21 20 20 21 32 32 32 32 31 32 32 32 34 32 24 24 3 17 24 32 7 26 26 1 26 10 1 10 26 20 17 17 17 17 17 26 7 2 2 2 2 7 2 34 2 2 2 2 38 23 23 23 15 38 38 38 37 4 4 5 4 3 4 4 4 4 4 4 4 4 4 4 43 62 61 62 52 9 8 7 5 7 6 2 2 5 5 4 1 1 1 44 44 44 10 44 41 10 4 4 6 1 3 1 4 10 10 2 8 47 47 47 47 46 46 43 42 5 1 46 3 8 4 4 46 46 3 47 5 1 47 47 46 43 33 4 33 31 5 1 46 54 54 53 53 50 9 2 2 2 9 9 9 1 10 9 9 5 2 1 4 2 2 2 3 2 2 2 2 2 1 3 8 2 2 33 4 33 33 2 2 1 33 21 142 44 186 137 185 186 138 50 21 21 186 21 21 20 138 6 6 6 3 3 6 6 6 4 4 4 82 83 83 56 26 56 27 6 83 83 47 29 29 1 29 47 47 47 83 48 48 1 47 83 83 68 68 50 50 45 49 49 82 4 4 4 4 4 4 4 3 1 4 4 4 4 3 3 3 4 4 4 4 3 3 4 4 4 26 26 26 26 26 26 1 1 1 1 1 1 1 26 21 26 10 10 10 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_dtree.c: directory B+-tree manager * * B+-tree with variable length key directory: * * each directory page is structured as an array of 32-byte * directory entry slots initialized as a freelist * to avoid search/compaction of free space at insertion. * when an entry is inserted, a number of slots are allocated * from the freelist as required to store variable length data * of the entry; when the entry is deleted, slots of the entry * are returned to freelist. * * leaf entry stores full name as key and file serial number * (aka inode number) as data. * internal/router entry stores sufffix compressed name * as key and simple extent descriptor as data. * * each directory page maintains a sorted entry index table * which stores the start slot index of sorted entries * to allow binary search on the table. * * directory starts as a root/leaf page in on-disk inode * inline data area. * when it becomes full, it starts a leaf of a external extent * of length of 1 block. each time the first leaf becomes full, * it is extended rather than split (its size is doubled), * until its length becoms 4 KBytes, from then the extent is split * with new 4 Kbyte extent when it becomes full * to reduce external fragmentation of small directories. * * blah, blah, blah, for linear scan of directory in pieces by * readdir(). * * * case-insensitive directory file system * * names are stored in case-sensitive way in leaf entry. * but stored, searched and compared in case-insensitive (uppercase) order * (i.e., both search key and entry key are folded for search/compare): * (note that case-sensitive order is BROKEN in storage, e.g., * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad * * entries which folds to the same key makes up a equivalent class * whose members are stored as contiguous cluster (may cross page boundary) * but whose order is arbitrary and acts as duplicate, e.g., * abc, Abc, aBc, abC) * * once match is found at leaf, requires scan forward/backward * either for, in case-insensitive search, duplicate * or for, in case-sensitive search, for exact match * * router entry must be created/stored in case-insensitive way * in internal entry: * (right most key of left page and left most key of right page * are folded, and its suffix compression is propagated as router * key in parent) * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB> * should be made the router key for the split) * * case-insensitive search: * * fold search key; * * case-insensitive search of B-tree: * for internal entry, router key is already folded; * for leaf entry, fold the entry key before comparison. * * if (leaf entry case-insensitive match found) * if (next entry satisfies case-insensitive match) * return EDUPLICATE; * if (prev entry satisfies case-insensitive match) * return EDUPLICATE; * return match; * else * return no match; * * serialization: * target directory inode lock is being held on entry/exit * of all main directory service routines. * * log based recovery: */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dmap.h" #include "jfs_unicode.h" #include "jfs_debug.h" /* dtree split parameter */ struct dtsplit { struct metapage *mp; s16 index; s16 nslot; struct component_name *key; ddata_t *data; struct pxdlist *pxdlist; }; #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* get page buffer for specified block address */ #define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ do { \ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ MP = NULL; \ RC = -EIO; \ } \ } \ } while (0) /* for consistency */ #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) #define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) /* * forward references */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp); static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack); static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); static int dtReadFirst(struct inode *ip, struct btstack * btstack); static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack); static int dtCompare(struct component_name * key, dtpage_t * p, int si); static int ciCompare(struct component_name * key, dtpage_t * p, int si, int flag); static void dtGetKey(dtpage_t * p, int i, struct component_name * key, int flag); static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag); static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock **); static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index); static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); #define ciToUpper(c) UniStrupr((c)->name) /* * read_index_page() * * Reads a page of a directory's index table. * Having metadata mapped into the directory inode's address space * presents a multitude of problems. We avoid this by mapping to * the absolute address space outside of the *_metapage routines */ static struct metapage *read_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return read_metapage(inode, xaddr, PSIZE, 1); } /* * get_index_page() * * Same as get_index_page(), but get's a new page without reading */ static struct metapage *get_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return get_metapage(inode, xaddr, PSIZE, 1); } /* * find_index() * * Returns dtree page containing directory table entry for specified * index and pointer to its entry. * * mp must be released by caller. */ static struct dir_table_slot *find_index(struct inode *ip, u32 index, struct metapage ** mp, s64 *lblock) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); s64 blkno; s64 offset; int page_offset; struct dir_table_slot *slot; static int maxWarnings = 10; if (index < 2) { if (maxWarnings) { jfs_warn("find_entry called with index = %d", index); maxWarnings--; } return NULL; } if (index >= jfs_ip->next_index) { jfs_warn("find_entry called with index >= next_index"); return NULL; } if (jfs_dirtable_inline(ip)) { /* * Inline directory table */ *mp = NULL; slot = &jfs_ip->i_dirtable[index - 2]; } else { offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << JFS_SBI(ip->i_sb)->l2nbperpage; if (*mp && (*lblock != blkno)) { release_metapage(*mp); *mp = NULL; } if (!(*mp)) { *lblock = blkno; *mp = read_index_page(ip, blkno); } if (!(*mp)) { jfs_err("free_index: error reading directory table"); return NULL; } slot = (struct dir_table_slot *) ((char *) (*mp)->data + page_offset); } return slot; } static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, u32 index) { struct tlock *tlck; struct linelock *llck; struct lv *lv; tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) tlck->lock; if (llck->index >= llck->maxcnt) llck = txLinelock(llck); lv = &llck->lv[llck->index]; /* * Linelock slot size is twice the size of directory table * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; llck->index++; } /* * add_index() * * Adds an entry to the directory index table. This is used to provide * each directory entry with a persistent index in which to resume * directory traversals */ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) { struct super_block *sb = ip->i_sb; struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); u64 blkno; struct dir_table_slot *dirtab_slot; u32 index; struct linelock *llck; struct lv *lv; struct metapage *mp; s64 offset; uint page_offset; struct tlock *tlck; s64 xaddr; ASSERT(DO_INDEX(ip)); if (jfs_ip->next_index < 2) { jfs_warn("add_index: next_index = %d. Resetting!", jfs_ip->next_index); jfs_ip->next_index = 2; } index = jfs_ip->next_index++; if (index <= MAX_INLINE_DIRTABLE_ENTRY) { /* * i_size reflects size of index table, or 8 bytes per entry. */ ip->i_size = (loff_t) (index - 1) << 3; /* * dir table fits inline within inode */ dirtab_slot = &jfs_ip->i_dirtable[index-2]; dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); set_cflag(COMMIT_Dirtable, ip); return index; } if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { struct dir_table_slot temp_table[12]; /* * It's time to move the inline table to an external * page and begin to build the xtree */ if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { dquot_free_block(ip, sbi->nbperpage); goto clean_up; } /* * Save the table, we're going to overwrite it with the * xtree root */ memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); /* * Initialize empty x-tree */ xtInitRoot(tid, ip); /* * Add the first block to the xtree */ if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { /* This really shouldn't fail */ jfs_warn("add_index: xtInsert failed!"); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; mp = get_index_page(ip, 0); if (!mp) { jfs_err("add_index: get_metapage failed!"); xtTruncate(tid, ip, 0, COMMIT_PWMAP); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); goto clean_up; } tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) & tlck->lock; ASSERT(llck->index == 0); lv = &llck->lv[0]; lv->offset = 0; lv->length = 6; /* tlckDATA slot size is 16 bytes */ llck->index++; memcpy(mp->data, temp_table, sizeof(temp_table)); mark_metapage_dirty(mp); release_metapage(mp); /* * Logging is now directed by xtree tlocks */ clear_cflag(COMMIT_Dirtable, ip); } offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; if (page_offset == 0) { /* * This will be the beginning of a new page */ xaddr = 0; if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { jfs_warn("add_index: xtInsert failed!"); goto clean_up; } ip->i_size += PSIZE; if ((mp = get_index_page(ip, blkno))) memset(mp->data, 0, PSIZE); /* Just looks better */ else xtTruncate(tid, ip, offset, COMMIT_PWMAP); } else mp = read_index_page(ip, blkno); if (!mp) { jfs_err("add_index: get/read_metapage failed!"); goto clean_up; } lock_index(tid, ip, mp, index); dirtab_slot = (struct dir_table_slot *) ((char *) mp->data + page_offset); dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); mark_metapage_dirty(mp); release_metapage(mp); return index; clean_up: jfs_ip->next_index--; return 0; } /* * free_index() * * Marks an entry to the directory index table as free. */ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) { struct dir_table_slot *dirtab_slot; s64 lblock; struct metapage *mp = NULL; dirtab_slot = find_index(ip, index, &mp, &lblock); if (!dirtab_slot) return; dirtab_slot->flag = DIR_INDEX_FREE; dirtab_slot->slot = dirtab_slot->addr1 = 0; dirtab_slot->addr2 = cpu_to_le32(next); if (mp) { lock_index(tid, ip, mp, index); mark_metapage_dirty(mp); release_metapage(mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * modify_index() * * Changes an entry in the directory index table */ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, int slot, struct metapage ** mp, s64 *lblock) { struct dir_table_slot *dirtab_slot; dirtab_slot = find_index(ip, index, mp, lblock); if (!dirtab_slot) return; DTSaddress(dirtab_slot, bn); dirtab_slot->slot = slot; if (*mp) { lock_index(tid, ip, *mp, index); mark_metapage_dirty(*mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * read_index() * * reads a directory table slot */ static int read_index(struct inode *ip, u32 index, struct dir_table_slot * dirtab_slot) { s64 lblock; struct metapage *mp = NULL; struct dir_table_slot *slot; slot = find_index(ip, index, &mp, &lblock); if (!slot) { return -EIO; } memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); if (mp) release_metapage(mp); return 0; } /* * dtSearch() * * function: * Search for the entry with specified key * * parameter: * * return: 0 - search result on stack, leaf page pinned; * errno - I/O error */ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct btstack * btstack, int flag) { int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; struct metapage *mp; dtpage_t *p; s8 *stbl; int base, index, lim; struct btframe *btsp; pxd_t *pxd; int psize = 288; /* initial in-line directory */ ino_t inumber; struct component_name ciKey; struct super_block *sb = ip->i_sb; ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_NOFS); if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; } /* uppercase search key for c-i directory */ UniStrcpy(ciKey.name, key->name); ciKey.namlen = key->namlen; /* only uppercase if case-insensitive support is on */ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { ciToUpper(&ciKey); } BT_CLR(btstack); /* reset stack */ /* init level count for max pages to split */ btstack->nsplit = 1; /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) goto dtSearch_Exit1; /* get sorted entry table of the page */ stbl = DT_GETSTBL(p); /* * binary search with search key K on the current page. */ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); if (stbl[index] < 0) { rc = -EIO; goto out; } if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = ciCompare(&ciKey, p, stbl[index], JFS_SBI(sb)->mntflag); } else { /* router key is in uppercase */ cmp = dtCompare(&ciKey, p, stbl[index]); } if (cmp == 0) { /* * search hit */ /* search hit - leaf page: * return the entry found */ if (p->header.flag & BT_LEAF) { inumber = le32_to_cpu( ((struct ldtentry *) & p->slot[stbl[index]])->inumber); /* * search for JFS_LOOKUP */ if (flag == JFS_LOOKUP) { *data = inumber; rc = 0; goto out; } /* * search for JFS_CREATE */ if (flag == JFS_CREATE) { *data = inumber; rc = -EEXIST; goto out; } /* * search for JFS_REMOVE or JFS_RENAME */ if ((flag == JFS_REMOVE || flag == JFS_RENAME) && *data != inumber) { rc = -ESTALE; goto out; } /* * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME */ /* save search result */ *data = inumber; btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* search hit - internal page: * descend/search its child page */ goto getChild; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. */ /* * search miss - leaf page * * return location of entry (base) where new entry with * search key K is to be inserted. */ if (p->header.flag & BT_LEAF) { /* * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME */ if (flag == JFS_LOOKUP || flag == JFS_REMOVE || flag == JFS_RENAME) { rc = -ENOENT; goto out; } /* * search for JFS_CREATE|JFS_FINDDIR: * * save search result */ *data = 0; btsp = btstack->top; btsp->bn = bn; btsp->index = base; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* * search miss - internal page * * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ getChild: /* update max. number of pages to split */ if (BT_STACK_FULL(btstack)) { /* Something's corrupted, mark filesystem dirty so * chkdsk will fix it. */ jfs_error(sb, "stack overrun!\n"); BT_STACK_DUMP(btstack); rc = -EIO; goto out; } btstack->nsplit++; /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, index); /* get the child page block number */ pxd = (pxd_t *) & p->slot[stbl[index]]; bn = addressPXD(pxd); psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } out: DT_PUTPAGE(mp); dtSearch_Exit1: kfree(ciKey.name); dtSearch_Exit2: return rc; } /* * dtInsert() * * function: insert an entry to directory tree * * parameter: * * return: 0 - success; * errno - failure; */ int dtInsert(tid_t tid, struct inode *ip, struct component_name * name, ino_t * fsn, struct btstack * btstack) { int rc = 0; struct metapage *mp; /* meta-page buffer */ dtpage_t *p; /* base B+-tree index page */ s64 bn; int index; struct dtsplit split; /* split information */ ddata_t data; struct dt_lock *dtlck; int n; struct tlock *tlck; struct lv *lv; /* * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of * the full page. */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); if (p->header.freelist == 0) return -EINVAL; /* * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { DT_PUTPAGE(mp); return -EMLINK; } n = NDTLEAF(name->namlen); data.leaf.tid = tid; data.leaf.ip = ip; } else { n = NDTLEAF_LEGACY(name->namlen); data.leaf.ip = NULL; /* signifies legacy directory format */ } data.leaf.ino = *fsn; /* * leaf page does not have enough room for new entry: * * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ if (n > p->header.freecnt) { split.mp = mp; split.index = index; split.nslot = n; split.key = name; split.data = &data; rc = dtSplitUp(tid, ip, &split, btstack); return rc; } /* * leaf page does have enough room for new entry: * * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; dtInsertEntry(p, index, name, &data, &dtlck); /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; n = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + n; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; } /* * dtSplitUp() * * function: propagate insertion bottom up; * * parameter: * * return: 0 - success; * errno - failure; * leaf page unpinned; */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); int rc = 0; struct metapage *smp; dtpage_t *sp; /* split page */ struct metapage *rmp; dtpage_t *rp; /* new right page split from sp */ pxd_t rpxd; /* new right page extent descriptor */ struct metapage *lmp; dtpage_t *lp; /* left child page */ int skip; /* index of entry of insertion */ struct btframe *parent; /* parent page entry on traverse stack */ s64 xaddr, nxaddr; int xlen, xsize; struct pxdlist pxdlist; pxd_t *pxd; struct component_name key = { 0, NULL }; ddata_t *data = split->data; int n; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int quota_allocation = 0; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS); if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; goto dtSplitUp_Exit; } /* * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* * allocate a single extent child page */ xlen = 1; n = sbi->bsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ if (n <= split->nslot) xlen++; if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { DT_PUTPAGE(smp); goto freeKeyName; } pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); split->pxdlist = &pxdlist; rc = dtSplitRoot(tid, ip, split, &rmp); if (rc) dbFree(ip, xaddr, xlen); else DT_PUTPAGE(rmp); DT_PUTPAGE(smp); if (!DO_INDEX(ip)) ip->i_size = xlen << sbi->l2bsize; goto freeKeyName; } /* * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) */ pxd = &sp->header.self; xlen = lengthPXD(pxd); xsize = xlen << sbi->l2bsize; if (xsize < PSIZE) { xaddr = addressPXD(pxd); n = xsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ if ((n + sp->header.freecnt) <= split->nslot) n = xlen + (xlen << 1); else n = xlen; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, n); if (rc) goto extendOut; quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, (s64) n, &nxaddr))) goto extendOut; pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, nxaddr); PXDlength(pxd, xlen + n); split->pxdlist = &pxdlist; if ((rc = dtExtendPage(tid, ip, split, btstack))) { nxaddr = addressPXD(pxd); if (xaddr != nxaddr) { /* free relocated extent */ xlen = lengthPXD(pxd); dbFree(ip, nxaddr, (s64) xlen); } else { /* free extended delta */ xlen = lengthPXD(pxd) - n; xaddr = addressPXD(pxd) + xlen; dbFree(ip, xaddr, (s64) n); } } else if (!DO_INDEX(ip)) ip->i_size = lengthPXD(pxd) << sbi->l2bsize; extendOut: DT_PUTPAGE(smp); goto freeKeyName; } /* * split leaf page <sp> into <sp> and a new right page <rp>. * * return <rp> pinned and its extent descriptor <rpxd> */ /* * allocate new directory page extent and * new index page(s) to cover page split(s) * * allocation hint: ? */ n = btstack->nsplit; pxdlist.maxnpxd = pxdlist.npxd = 0; xlen = sbi->nbperpage; for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); pxdlist.maxnpxd++; continue; } DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } split->pxdlist = &pxdlist; if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } if (!DO_INDEX(ip)) ip->i_size += PSIZE; /* * propagate up the router entry for the leaf page just split * * insert a router entry for the new page into the parent page, * propagate the insert/split up the tree by walking back the stack * of (bn of parent page, index of child page entry in parent page) * that were traversed during the search for the page that split. * * the propagation of insert/split up the tree stops if the root * splits or the page inserted into doesn't have to split to hold * the new entry. * * the parent entry for the split page remains the same, and * a new entry is inserted at its right with the first key and * block number of the new right page. * * There are a maximum of 4 pages pinned at any time: * two children, left parent and right parent (when the parent splits). * keep the child pages pinned while working on the parent. * make sure that all pins are released at exit. */ while ((parent = BT_POP(btstack)) != NULL) { /* parent page specified by stack frame <parent> */ /* keep current child pages (<lp>, <rp>) pinned */ lmp = smp; lp = sp; /* * insert router entry in parent for new right child page <rp> */ /* get the parent page <sp> */ DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); goto splitOut; } /* * The new key entry goes ONE AFTER the index of parent entry, * because the split was to the right. */ skip = parent->index + 1; /* * compute the key for the router entry * * key suffix compression: * for internal pages that have leaf pages as children, * retain only what's needed to distinguish between * the new entry and the entry on the page to its left. * If the keys compare equal, retain the entire key. * * note that compression is performed only at computing * router key at the lowest internal level. * further compression of the key between pairs of higher * level internal pages loses too much information and * the search may fail. * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} * results in two adjacent parent entries (a)(xx). * if split occurs between these two entries, and * if compression is applied, the router key of parent entry * of right page (x) will divert search for x into right * subtree and miss x in the left subtree.) * * the entire key must be retained for the next-to-leftmost * internal key at any level of the tree, or search may fail * (e.g., ?) */ switch (rp->header.flag & BT_TYPE) { case BT_LEAF: /* * compute the length of prefix for suffix compression * between last entry of left page and first entry * of right page */ if ((sp->header.flag & BT_ROOT && skip > 1) || sp->header.prev != 0 || skip > 1) { /* compute uppercase router prefix key */ rc = ciGetLeafPrefixKey(lp, lp->header.nextindex-1, rp, 0, &key, sbi->mntflag); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); DT_PUTPAGE(smp); goto splitOut; } } else { /* next to leftmost entry of lowest internal level */ /* compute uppercase router key */ dtGetKey(rp, 0, &key, sbi->mntflag); key.name[key.namlen] = 0; if ((sbi->mntflag & JFS_OS2) == JFS_OS2) ciToUpper(&key); } n = NDTINTERNAL(key.namlen); break; case BT_INTERNAL: dtGetKey(rp, 0, &key, sbi->mntflag); n = NDTINTERNAL(key.namlen); break; default: jfs_err("dtSplitUp(): UFO!"); break; } /* unpin left child page */ DT_PUTPAGE(lmp); /* * compute the data for the router entry */ data->xd = rpxd; /* child page xd */ /* * parent page is full - split the parent page */ if (n > sp->header.freecnt) { /* init for parent page split */ split->mp = smp; split->index = skip; /* index at insert */ split->nslot = n; split->key = &key; /* split->data = data; */ /* unpin right child page */ DT_PUTPAGE(rmp); /* The split routines insert the new entry, * acquire txLock as appropriate. * return <rp> pinned and its block number <rbn>. */ rc = (sp->header.flag & BT_ROOT) ? dtSplitRoot(tid, ip, split, &rmp) : dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); if (rc) { DT_PUTPAGE(smp); goto splitOut; } /* smp and rmp are pinned */ } /* * parent page is not full - insert router entry in parent page */ else { BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the parent page */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root parent page */ if (!(sp->header.flag & BT_ROOT)) { lv++; n = skip >> L2DTSLOTSIZE; lv->offset = sp->header.stblindex + n; lv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } dtInsertEntry(sp, skip, &key, data, &dtlck); /* exit propagate up */ break; } } /* unpin current split and its right page */ DT_PUTPAGE(smp); DT_PUTPAGE(rmp); /* * free remaining extents allocated for split */ splitOut: n = pxdlist.npxd; pxd = &pxdlist.pxd[n]; for (; n < pxdlist.maxnpxd; n++, pxd++) dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); freeKeyName: kfree(key.name); /* Rollback quota allocation */ if (rc && quota_allocation) dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: return rc; } /* * dtSplitPage() * * function: Split a non-root page of a btree. * * parameter: * * return: 0 - success; * errno - failure; * return split and new page pinned; */ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) { int rc = 0; struct metapage *smp; dtpage_t *sp; struct metapage *rmp; dtpage_t *rp; /* new right page allocated */ s64 rbn; /* new right page block number */ struct metapage *mp; dtpage_t *p; s64 nextbn; struct pxdlist *pxdlist; pxd_t *pxd; int skip, nextindex, half, left, nxt, off, si; struct ldtentry *ldtentry; struct idtentry *idtentry; u8 *stbl; struct dtslot *f; int fsi, stblsize; int n; struct dt_lock *sdtlck, *rdtlck; struct tlock *tlck; struct dt_lock *dtlck; struct lv *slv, *rlv, *lv; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); /* * allocate the new right page for the split */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) return -EIO; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); rdtlck = (struct dt_lock *) & tlck->lock; rp = (dtpage_t *) rmp->data; *rpp = rp; rp->header.self = *pxd; BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the split page * * action: */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); sdtlck = (struct dt_lock *) & tlck->lock; /* linelock header of split page */ ASSERT(sdtlck->index == 0); slv = & sdtlck->lv[0]; slv->offset = 0; slv->length = 1; sdtlck->index++; /* * initialize/update sibling pointers between sp and rp */ nextbn = le64_to_cpu(sp->header.next); rp->header.next = cpu_to_le64(nextbn); rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); sp->header.next = cpu_to_le64(rbn); /* * initialize new right page */ rp->header.flag = sp->header.flag; /* compute sorted entry table at start of extent data area */ rp->header.nextindex = 0; rp->header.stblindex = 1; n = PSIZE >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ /* init freelist */ fsi = rp->header.stblindex + stblsize; rp->header.freelist = fsi; rp->header.freecnt = rp->header.maxslot - fsi; /* * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is * sequential. Adding an empty page on the side of the level is less * work and can push the fill factor much higher than normal. * If we're wrong it's no big deal, we'll just do the split the right * way next time. * (It may look like it's equally easy to do a similar hack for * reverse sorted data, that is, split the tree left, * but it's not. Be my guest.) */ if (nextbn == 0 && split->index == sp->header.nextindex) { /* linelock header + stbl (first slot) of new page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 2; rdtlck->index++; /* * initialize freelist of new right page */ f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* insert entry at the first entry of the new right page */ dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); goto out; } /* * non-sequential insert (at possibly middle page) */ /* * update prev pointer of previous right sibling page; */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) { discard_metapage(rmp); return rc; } BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header of previous right sibling page */ lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(rbn); DT_PUTPAGE(mp); } /* * split the data between the split and right pages. */ skip = split->index; half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ left = 0; /* * compute fill factor for split pages * * <nxt> traces the next entry to move to rp * <off> traces the next entry to stay in sp */ stbl = (u8 *) & sp->slot[sp->header.stblindex]; nextindex = sp->header.nextindex; for (nxt = off = 0; nxt < nextindex; ++off) { if (off == skip) /* check for fill factor with new entry size */ n = split->nslot; else { si = stbl[nxt]; switch (sp->header.flag & BT_TYPE) { case BT_LEAF: ldtentry = (struct ldtentry *) & sp->slot[si]; if (DO_INDEX(ip)) n = NDTLEAF(ldtentry->namlen); else n = NDTLEAF_LEGACY(ldtentry-> namlen); break; case BT_INTERNAL: idtentry = (struct idtentry *) & sp->slot[si]; n = NDTINTERNAL(idtentry->namlen); break; default: break; } ++nxt; /* advance to next entry to move in sp */ } left += n; if (left >= half) break; } /* <nxt> poins to the 1st entry to move */ /* * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * * split page moved out entries are linelocked; * new/right page moved in entries are linelocked; */ /* linelock header + stbl of new right page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 5; rdtlck->index++; dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); sp->header.nextindex = nxt; /* * finalize freelist of new right page */ fsi = rp->header.freelist; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * the skipped index was on the left page, */ if (skip <= off) { /* insert the new entry in the split page */ dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); /* linelock stbl of split page */ if (sdtlck->index >= sdtlck->maxcnt) sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[sdtlck->index]; n = skip >> L2DTSLOTSIZE; slv->offset = sp->header.stblindex + n; slv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; sdtlck->index++; } /* * the skipped index was on the right page, */ else { /* adjust the skip index to reflect the new position */ skip -= nxt; /* insert the new entry in the right page */ dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); } out: *rmpp = rmp; *rpxdp = *pxd; return rc; } /* * dtExtendPage() * * function: extend 1st/only directory leaf page * * parameter: * * return: 0 - success; * errno - failure; * return extended page pinned; */ static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct super_block *sb = ip->i_sb; int rc; struct metapage *smp, *pmp, *mp; dtpage_t *sp, *pp; struct pxdlist *pxdlist; pxd_t *pxd, *tpxd; int xlen, xsize; int newstblindex, newstblsize; int oldstblindex, oldstblsize; int fsi, last; struct dtslot *f; struct btframe *parent; int n; struct dt_lock *dtlck; s64 xaddr, txaddr; struct tlock *tlck; struct pxd_lock *pxdlock; struct lv *lv; uint type; struct ldtentry *ldtentry; u8 *stbl; /* get page to extend */ smp = split->mp; sp = DT_PAGE(ip, smp); /* get parent/root page */ parent = BT_POP(btstack); DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); if (rc) return (rc); /* * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; xaddr = addressPXD(pxd); tpxd = &sp->header.self; txaddr = addressPXD(tpxd); /* in-place extension */ if (xaddr == txaddr) { type = tlckEXTEND; } /* relocation */ else { type = tlckNEW; /* save moved extent descriptor for later free */ tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = sp->header.self; pxdlock->index = 1; /* * Update directory index table to reflect new page address */ if (DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(sp); for (n = 0; n < sp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & sp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), xaddr, n, &mp, &lblock); } if (mp) release_metapage(mp); } } /* * extend the page */ sp->header.self = *pxd; jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the extended/leaf page */ tlck = txLock(tid, ip, smp, tlckDTREE | type); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[0]; /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; /* * copy old stbl to new stbl at start of extended area */ oldstblindex = sp->header.stblindex; oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; newstblindex = sp->header.maxslot; n = xsize >> L2DTSLOTSIZE; newstblsize = (n + 31) >> L2DTSLOTSIZE; memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], sp->header.nextindex); /* * in-line extension: linelock old area of extended page */ if (type == tlckEXTEND) { /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; lv++; /* linelock new stbl of extended page */ lv->offset = newstblindex; lv->length = newstblsize; } /* * relocation: linelock whole relocated area */ else { lv->offset = 0; lv->length = sp->header.maxslot + newstblsize; } dtlck->index++; sp->header.maxslot = n; sp->header.stblindex = newstblindex; /* sp->header.nextindex remains the same */ /* * add old stbl region at head of freelist */ fsi = oldstblindex; f = &sp->slot[fsi]; last = sp->header.freelist; for (n = 0; n < oldstblsize; n++, fsi++, f++) { f->next = last; last = fsi; } sp->header.freelist = last; sp->header.freecnt += oldstblsize; /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = newstblindex + newstblsize; f = &sp->slot[fsi]; for (fsi++; fsi < sp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) sp->header.freelist = n; else { do { f = &sp->slot[fsi]; fsi = f->next; } while (fsi != -1); f->next = n; } sp->header.freecnt += sp->header.maxslot - n; /* * insert the new entry */ dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); BT_MARK_DIRTY(pmp, ip); /* * linelock any freeslots residing in old extent */ if (type == tlckEXTEND) { n = sp->header.maxslot >> 2; if (sp->header.freelist < n) dtLinelockFreelist(sp, n, &dtlck); } /* * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page */ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[dtlck->index]; /* linelock parent entry - 1st slot */ lv->offset = 1; lv->length = 1; dtlck->index++; /* update the parent pxd for page extension */ tpxd = (pxd_t *) & pp->slot[1]; *tpxd = *pxd; DT_PUTPAGE(pmp); return 0; } /* * dtSplitRoot() * * function: * split the full root page into * original/root/split page and new right page * i.e., root remains fixed in tree anchor (inode) and * the root is copied to a single new right child page * since root page << non-root page, and * the split root page contains a single entry for the * new right child page. * * parameter: * * return: 0 - success; * errno - failure; * return new page pinned; */ static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) { struct super_block *sb = ip->i_sb; struct metapage *smp; dtroot_t *sp; struct metapage *rmp; dtpage_t *rp; s64 rbn; int xlen; int xsize; struct dtslot *f; s8 *stbl; int fsi, stblsize, n; struct idtentry *s; pxd_t *ppxd; struct pxdlist *pxdlist; pxd_t *pxd; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int rc; /* get split root page */ smp = split->mp; sp = &JFS_IP(ip)->i_dtroot; /* * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; rmp = get_metapage(ip, rbn, xsize, 1); if (!rmp) return -EIO; rp = rmp->data; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); dtlck = (struct dt_lock *) & tlck->lock; rp->header.flag = (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; rp->header.self = *pxd; /* initialize sibling pointers */ rp->header.next = 0; rp->header.prev = 0; /* * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = 10; /* 1 + 8 + 1 */ dtlck->index++; n = xsize >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* copy old stbl to new stbl at start of extended area */ rp->header.stblindex = DTROOTMAXSLOT; stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; memcpy(stbl, sp->header.stbl, sp->header.nextindex); rp->header.nextindex = sp->header.nextindex; /* copy old data area to start of new data area */ memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = DTROOTMAXSLOT + stblsize; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) rp->header.freelist = n; else { rp->header.freelist = fsi; do { f = &rp->slot[fsi]; fsi = f->next; } while (fsi >= 0); f->next = n; } rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; struct metapage *mp = NULL; struct ldtentry *ldtentry; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * insert the new entry into the new right/child page * (skip index in the new right page will not change) */ dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. * * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the root page (in-memory inode) */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; /* update page header of root */ if (sp->header.flag & BT_LEAF) { sp->header.flag &= ~BT_LEAF; sp->header.flag |= BT_INTERNAL; } /* init the first entry */ s = (struct idtentry *) & sp->slot[DTENTRYSTART]; ppxd = (pxd_t *) s; *ppxd = *pxd; s->next = -1; s->namlen = 0; stbl = sp->header.stbl; stbl[0] = DTENTRYSTART; sp->header.nextindex = 1; /* init freelist */ fsi = DTENTRYSTART + 1; f = &sp->slot[fsi]; /* init free region of remaining area */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; sp->header.freelist = DTENTRYSTART + 1; sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); *rmpp = rmp; return 0; } /* * dtDelete() * * function: delete the entry(s) referenced by a key. * * parameter: * * return: */ int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, ino_t * ino, int flag) { int rc = 0; s64 bn; struct metapage *mp, *imp; dtpage_t *p; int index; struct btstack btstack; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int i; struct ldtentry *ldtentry; u8 *stbl; u32 table_index, next_index; struct metapage *nmp; dtpage_t *np; /* * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). */ if ((rc = dtSearch(ip, key, ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* * We need to find put the index of the next entry into the * directory index table in order to resume a readdir from this * entry. */ if (DO_INDEX(ip)) { stbl = DT_GETSTBL(p); ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; table_index = le32_to_cpu(ldtentry->index); if (index == (p->header.nextindex - 1)) { /* * Last entry in this leaf page */ if ((p->header.flag & BT_ROOT) || (p->header.next == 0)) next_index = -1; else { /* Read next leaf page */ DT_GETPAGE(ip, le64_to_cpu(p->header.next), nmp, PSIZE, np, rc); if (rc) next_index = -1; else { stbl = DT_GETSTBL(np); ldtentry = (struct ldtentry *) & np-> slot[stbl[0]]; next_index = le32_to_cpu(ldtentry->index); DT_PUTPAGE(nmp); } } } else { ldtentry = (struct ldtentry *) & p->slot[stbl[index + 1]]; next_index = le32_to_cpu(ldtentry->index); } free_index(tid, ip, table_index, next_index); } /* * the leaf page becomes empty, delete the page */ if (p->header.nextindex == 1) { /* delete empty page */ rc = dtDeleteUp(tid, ip, mp, p, &btstack); } /* * the leaf page has other entries remaining: * * delete the entry from the leaf page. */ else { BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* * Do not assume that dtlck->index will be zero. During a * rename within a directory, this transaction may have * modified this page already when adding the new entry. */ /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the leaf entry */ dtDeleteEntry(p, index, &dtlck); /* * Update directory index table for entries moved in stbl */ if (DO_INDEX(ip) && index < p->header.nextindex) { s64 lblock; imp = NULL; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { ldtentry = (struct ldtentry *) & p->slot[stbl[i]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), bn, i, &imp, &lblock); } if (imp) release_metapage(imp); } DT_PUTPAGE(mp); } return rc; } /* * dtDeleteUp() * * function: * free empty pages as propagating deletion up the tree * * parameter: * * return: */ static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) { int rc = 0; struct metapage *mp; dtpage_t *p; int index, nextindex; int xlen; struct btframe *parent; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; struct pxd_lock *pxdlock; int i; /* * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(fmp); return 0; } /* * free the non-root leaf page */ /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor, and * the buffer page is freed; */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = fp->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, fp))) { BT_PUTPAGE(fmp); return rc; } xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); /* * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. * stop if the root page is reached (which is never deleted) or * if the entry deletion does not empty the page. */ while ((parent = BT_POP(btstack)) != NULL) { /* pin the parent page <sp> */ DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); if (rc) return rc; /* * free the extent of the child page deleted */ index = parent->index; /* * delete the entry for the child page from parent */ nextindex = p->header.nextindex; /* * the parent has the single entry being deleted: * * free the parent page which has become empty. */ if (nextindex == 1) { /* * keep the root internal page which has become empty */ if (p->header.flag & BT_ROOT) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(mp); return 0; } /* * free the parent page */ else { /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = p->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, p))) { DT_PUTPAGE(mp); return rc; } xlen = lengthPXD(&p->header.self); /* Free quota allocation */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); /* propagate up */ continue; } } /* * the parent has other entries remaining: * * delete the router entry from the parent page. */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the page * * action: router entry deletion */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the router entry */ dtDeleteEntry(p, index, &dtlck); /* reset key of new leftmost entry of level (for consistency) */ if (index == 0 && ((p->header.flag & BT_ROOT) || p->header.prev == 0)) dtTruncateEntry(p, 0, &dtlck); /* unpin the parent page */ DT_PUTPAGE(mp); /* exit propagation up */ break; } if (!DO_INDEX(ip)) ip->i_size -= PSIZE; return 0; } /* * dtRelink() * * function: * link around a freed page. * * parameter: * fp: page to be freed * * return: */ static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) { int rc; struct metapage *mp; s64 nextbn, prevbn; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; nextbn = le64_to_cpu(p->header.next); prevbn = le64_to_cpu(p->header.prev); /* update prev pointer of the next page */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page * * action: update prev pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(prevbn); DT_PUTPAGE(mp); } /* update next pointer of the previous page */ if (prevbn != 0) { DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the prev page * * action: update next pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.next = cpu_to_le64(nextbn); DT_PUTPAGE(mp); } return 0; } /* * dtInitRoot() * * initialize directory root (inline in inode) */ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); dtroot_t *p; int fsi; struct dtslot *f; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; u16 xflag_save; /* * If this was previously an non-empty directory, we need to remove * the old directory table. */ if (DO_INDEX(ip)) { if (!jfs_dirtable_inline(ip)) { struct tblock *tblk = tid_to_tblock(tid); /* * We're playing games with the tid's xflag. If * we're removing a regular file, the file's xtree * is committed with COMMIT_PMAP, but we always * commit the directories xtree with COMMIT_PWMAP. */ xflag_save = tblk->xflag; tblk->xflag = 0; /* * xtTruncate isn't guaranteed to fully truncate * the xtree. The caller needs to check i_size * after committing the transaction to see if * additional truncation is needed. The * COMMIT_Stale flag tells caller that we * initiated the truncation. */ xtTruncate(tid, ip, 0, COMMIT_PWMAP); set_cflag(COMMIT_Stale, ip); tblk->xflag = xflag_save; } else ip->i_size = 1; jfs_ip->next_index = 2; } else ip->i_size = IDATASIZE; /* * acquire a transaction lock on the root * * action: directory initialization; */ tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, tlckDTREE | tlckENTRY | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; p = &jfs_ip->i_dtroot; p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; p->header.nextindex = 0; /* init freelist */ fsi = 1; f = &p->slot[fsi]; /* init data area of root */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; p->header.freelist = 1; p->header.freecnt = 8; /* init '..' entry */ p->header.idotdot = cpu_to_le32(idotdot); return; } /* * add_missing_indices() * * function: Fix dtree page in which one or more entries has an invalid index. * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. */ static void add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; int i; uint index; struct lv *lv; struct metapage *mp; dtpage_t *p; int rc; s8 *stbl; tid_t tid; struct tlock *tlck; tid = txBegin(inode->i_sb, 0); DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); if (rc) { printk(KERN_ERR "DT_GETPAGE failed!\n"); goto end; } BT_MARK_DIRTY(mp, inode); ASSERT(p->header.flag & BT_LEAF); tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); if (BT_IS_ROOT(mp)) tlck->type |= tlckBTROOT; dtlck = (struct dt_lock *) &tlck->lock; stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { d->index = cpu_to_le32(add_index(tid, inode, bn, i)); if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = &dtlck->lv[dtlck->index]; lv->offset = stbl[i]; lv->length = 1; dtlck->index++; } } DT_PUTPAGE(mp); (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); } /* * Buffer to hold directory entry info while traversing a dtree page * before being fed to the filldir function */ struct jfs_dirent { loff_t position; int ino; u16 name_len; char name[]; }; /* * function to determine next variable-sized jfs_dirent in buffer */ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) { return (struct jfs_dirent *) ((char *)dirent + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + sizeof (loff_t) - 1) & ~(sizeof (loff_t) - 1))); } /* * jfs_readdir() * * function: read directory entries sequentially * from the specified entry offset * * parameter: * * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ int jfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) &dtpos; s64 bn; struct metapage *mp; dtpage_t *p; int index; s8 *stbl; struct btstack btstack; int i, next; struct ldtentry *d; struct dtslot *t; int d_namleft, len, outlen; unsigned long dirent_buf; char *name_ptr; u32 dir_index; int do_index = 0; uint loop_count = 0; struct jfs_dirent *jfs_dirent; int jfs_dirents; int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. * Special cases: 0 = . * 1 = .. * -1 = End of directory */ do_index = 1; dir_index = (u32) ctx->pos; /* * NFSv4 reserves cookies 1 and 2 for . and .. so the value * we return to the vfs is one greater than the one we use * internally. */ if (dir_index) dir_index--; if (dir_index > 1) { struct dir_table_slot dirtab_slot; if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. Directory has shrunk */ ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected infinite loop!"); ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { ctx->pos = DIREND; return 0; } goto repeat; } bn = addressDTS(&dirtab_slot); index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); ctx->pos = DIREND; return 0; } } else { if (dir_index == 0) { /* * self "." */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadFirst(ip, &btstack))) return rc; DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); } } else { /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * * pn = 0; index = 1: First entry "." * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; if (dtpos < 2) { /* build "." entry */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with invalid offset!"); } dtoffset->pn = 1; dtoffset->index = 0; ctx->pos = dtpos; } if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", rc); ctx->pos = DIREND; return 0; } /* get start leaf page and index */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* offset beyond directory eof ? */ if (bn < 0) { ctx->pos = DIREND; return 0; } } dirent_buf = __get_free_page(GFP_KERNEL); if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); ctx->pos = DIREND; return -ENOMEM; } while (1) { jfs_dirent = (struct jfs_dirent *) dirent_buf; jfs_dirents = 0; overflow = fix_page = 0; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { d = (struct ldtentry *) & p->slot[stbl[i]]; if (((long) jfs_dirent + d->namlen + 1) > (dirent_buf + PAGE_SIZE)) { /* DBCS codepages could overrun dirent_buf */ index = i; overflow = 1; break; } d_namleft = d->namlen; name_ptr = jfs_dirent->name; jfs_dirent->ino = le32_to_cpu(d->inumber); if (do_index) { len = min(d_namleft, DTLHDRDATALEN); jfs_dirent->position = le32_to_cpu(d->index); /* * d->index should always be valid, but it * isn't. fsck.jfs doesn't create the * directory index for the lost+found * directory. Rather than let it go, * we can try to fix it. */ if ((jfs_dirent->position < 2) || (jfs_dirent->position >= JFS_IP(ip)->next_index)) { if (!page_fixed && !isReadOnly(ip)) { fix_page = 1; /* * setting overflow and setting * index to i will cause the * same page to be processed * again starting here */ overflow = 1; index = i; break; } jfs_dirent->position = unique_pos++; } /* * We add 1 to the index because we may * use a value of 2 internally, and NFSv4 * doesn't like that. */ jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); } /* copy the name of head/only segment */ outlen = jfs_strfromUCS_le(name_ptr, d->name, len, codepage); jfs_dirent->name_len = outlen; /* copy name in the additional segment(s) */ next = d->next; while (next >= 0) { t = (struct dtslot *) & p->slot[next]; name_ptr += outlen; d_namleft -= len; /* Sanity Check */ if (d_namleft == 0) { jfs_error(ip->i_sb, "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", (long)ip->i_ino, (long long)bn, i); goto skip_one; } len = min(d_namleft, DTSLOTDATALEN); outlen = jfs_strfromUCS_le(name_ptr, t->name, len, codepage); jfs_dirent->name_len += outlen; next = t->next; } jfs_dirents++; jfs_dirent = next_jfs_dirent(jfs_dirent); skip_one: if (!do_index) dtoffset->index++; } if (!overflow) { /* Point to next leaf page */ if (p->header.flag & BT_ROOT) bn = 0; else { bn = le64_to_cpu(p->header.next); index = 0; /* update offset (pn:index) for new page */ if (!do_index) { dtoffset->pn++; dtoffset->index = 0; } } page_fixed = 0; } /* unpin previous leaf page */ DT_PUTPAGE(mp); jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { ctx->pos = jfs_dirent->position; if (!dir_emit(ctx, jfs_dirent->name, jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); } if (fix_page) { add_missing_indices(ip, bn); page_fixed = 1; } if (!overflow && (bn == 0)) { ctx->pos = DIREND; break; } DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { free_page(dirent_buf); return rc; } } out: free_page(dirent_buf); return rc; } /* * dtReadFirst() * * function: get the leftmost page of the directory */ static int dtReadFirst(struct inode *ip, struct btstack * btstack) { int rc = 0; s64 bn; int psize = 288; /* initial in-line directory */ struct metapage *mp; dtpage_t *p; s8 *stbl; struct btframe *btsp; pxd_t *xd; BT_CLR(btstack); /* reset stack */ /* * descend leftmost path of the tree * * by convention, root bn = 0. */ for (bn = 0;;) { DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) return rc; /* * leftmost leaf page */ if (p->header.flag & BT_LEAF) { /* return leftmost entry */ btsp = btstack->top; btsp->bn = bn; btsp->index = 0; btsp->mp = mp; return 0; } /* * descend down to leftmost child page */ if (BT_STACK_FULL(btstack)) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "btstack overrun\n"); BT_STACK_DUMP(btstack); return -EIO; } /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, 0); /* get the leftmost entry */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[0]]; /* get the child page block address */ bn = addressPXD(xd); psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } } /* * dtReadNext() * * function: get the page of the specified offset (pn:index) * * return: if (offset > eof), bn = -1; * * note: if index > nextindex of the target leaf page, * start with 1st entry of next leaf page; */ static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack) { int rc = 0; struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) offset; s64 bn; struct metapage *mp; dtpage_t *p; int index; int pn; s8 *stbl; struct btframe *btsp, *parent; pxd_t *xd; /* * get leftmost leaf page pinned */ if ((rc = dtReadFirst(ip, btstack))) return rc; /* get leaf page */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* get the start offset (pn:index) */ pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ index = dtoffset->index; /* start at leftmost page ? */ if (pn == 0) { /* offset beyond eof ? */ if (index < p->header.nextindex) goto out; if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = index = 0; goto a; } /* start at non-leftmost page: scan parent pages for large pn */ if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start after next leaf page ? */ if (pn > 1) goto b; /* get leaf page pn = 1 */ a: bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } goto c; /* * scan last internal page level to get target leaf page */ b: /* unpin leftmost leaf page */ DT_PUTPAGE(mp); /* get left most parent page */ btsp = btstack->top; parent = btsp - 1; bn = parent->bn; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* scan parent pages at last internal page level */ while (pn >= p->header.nextindex) { pn -= p->header.nextindex; /* get next parent page address */ bn = le64_to_cpu(p->header.next); /* unpin current parent page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next parent page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* update parent page stack frame */ parent->bn = bn; } /* get leaf page address */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[pn]]; bn = addressPXD(xd); /* unpin parent page */ DT_PUTPAGE(mp); /* * get target leaf page */ c: DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * leaf page has been completed: * start with 1st entry of next leaf page */ if (index >= p->header.nextindex) { bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next leaf page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = 0; } out: /* return target leaf page pinned */ btsp = btstack->top; btsp->bn = bn; btsp->index = dtoffset->index; btsp->mp = mp; return 0; } /* * dtCompare() * * function: compare search key with an internal entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int dtCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si) { /* entry slot index */ wchar_t *kname; __le16 *name; int klen, namlen, len, rc; struct idtentry *ih; struct dtslot *t; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); /* compare with head/only segment */ len = min(klen, len); if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; /* compare with additional segment(s) */ kname += len; while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; kname += len; si = t->next; } return (klen - namlen); } /* * ciCompare() * * function: compare search key with an (leaf/internal) entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int ciCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si, /* entry slot index */ int flag) { wchar_t *kname, x; __le16 *name; int klen, namlen, len, rc; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int i; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; /* * leaf page entry */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; name = lh->name; namlen = lh->namlen; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } /* * internal page entry */ else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); } /* compare with head/only segment */ len = min(klen, len); for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; /* compare with additional segment(s) */ while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; si = t->next; } return (klen - namlen); } /* * ciGetLeafPrefixKey() * * function: compute prefix of suffix compression * from two adjacent leaf entries * across page boundary * * return: non-zero on error * */ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag) { int klen, namlen; wchar_t *pl, *pr, *kname; struct component_name lkey; struct component_name rkey; lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); return -ENOMEM; } /* get left and right key */ dtGetKey(lp, li, &lkey, flag); lkey.name[lkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&lkey); dtGetKey(rp, ri, &rkey, flag); rkey.name[rkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&rkey); /* compute prefix */ klen = 0; kname = key->name; namlen = min(lkey.namlen, rkey.namlen); for (pl = lkey.name, pr = rkey.name; namlen; pl++, pr++, namlen--, klen++, kname++) { *kname = *pr; if (*pl != *pr) { key->namlen = klen + 1; goto free_names; } } /* l->namlen <= r->namlen since l <= r */ if (lkey.namlen < rkey.namlen) { *kname = *pr; key->namlen = klen + 1; } else /* l->namelen == r->namelen */ key->namlen = klen; free_names: kfree(lkey.name); kfree(rkey.name); return 0; } /* * dtGetKey() * * function: get key of the entry */ static void dtGetKey(dtpage_t * p, int i, /* entry index */ struct component_name * key, int flag) { int si; s8 *stbl; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int namlen, len; wchar_t *kname; __le16 *name; /* get entry */ stbl = DT_GETSTBL(p); si = stbl[i]; if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; namlen = lh->namlen; name = lh->name; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; namlen = ih->namlen; name = ih->name; len = min(namlen, DTIHDRDATALEN); } key->namlen = namlen; kname = key->name; /* * move head/only segment */ UniStrncpy_from_le(kname, name, len); /* * move additional segment(s) */ while (si >= 0) { /* get next segment */ t = &p->slot[si]; kname += len; namlen -= len; len = min(namlen, DTSLOTDATALEN); UniStrncpy_from_le(kname, t->name, len); si = t->next; } } /* * dtInsertEntry() * * function: allocate free slot(s) and * write a leaf/internal entry * * return: entry slot index */ static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock ** dtlock) { struct dtslot *h, *t; struct ldtentry *lh = NULL; struct idtentry *ih = NULL; int hsi, fsi, klen, len, nextindex; wchar_t *kname; __le16 *name; s8 *stbl; pxd_t *xd; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; s64 bn = 0; struct metapage *mp = NULL; klen = key->namlen; kname = key->name; /* allocate a free slot */ hsi = fsi = p->header.freelist; h = &p->slot[fsi]; p->header.freelist = h->next; --p->header.freecnt; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = hsi; /* write head/only segment */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) h; lh->next = h->next; lh->inumber = cpu_to_le32(data->leaf.ino); lh->namlen = klen; name = lh->name; if (data->leaf.ip) { len = min(klen, DTLHDRDATALEN); if (!(p->header.flag & BT_ROOT)) bn = addressPXD(&p->header.self); lh->index = cpu_to_le32(add_index(data->leaf.tid, data->leaf.ip, bn, index)); } else len = min(klen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) h; ih->next = h->next; xd = (pxd_t *) ih; *xd = data->xd; ih->namlen = klen; name = ih->name; len = min(klen, DTIHDRDATALEN); } UniStrncpy_to_le(name, kname, len); n = 1; xsi = hsi; /* write additional segment(s) */ t = h; klen -= len; while (klen) { /* get free slot */ fsi = p->header.freelist; t = &p->slot[fsi]; p->header.freelist = t->next; --p->header.freecnt; /* is next slot contiguous ? */ if (fsi != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = fsi; n = 0; } kname += len; len = min(klen, DTSLOTDATALEN); UniStrncpy_to_le(t->name, kname, len); n++; xsi = fsi; klen -= len; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* terminate last/only segment */ if (h == t) { /* single segment entry */ if (p->header.flag & BT_LEAF) lh->next = -1; else ih->next = -1; } else /* multi-segment entry */ t->next = -1; /* if insert into middle, shift right succeeding entries in stbl */ stbl = DT_GETSTBL(p); nextindex = p->header.nextindex; if (index < nextindex) { memmove(stbl + index + 1, stbl + index, nextindex - index); if ((p->header.flag & BT_LEAF) && data->leaf.ip) { s64 lblock; /* * Need to update slot number for entries that moved * in the stbl */ mp = NULL; for (n = index + 1; n <= nextindex; n++) { lh = (struct ldtentry *) & (p->slot[stbl[n]]); modify_index(data->leaf.tid, data->leaf.ip, le32_to_cpu(lh->index), bn, n, &mp, &lblock); } if (mp) release_metapage(mp); } } stbl[index] = hsi; /* advance next available entry index of stbl */ ++p->header.nextindex; } /* * dtMoveEntry() * * function: move entries from split/left page to new/right page * * nextindex of dst page and freelist/freecnt of both pages * are updated. */ static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index) { int ssi, next; /* src slot index */ int di; /* dst entry index */ int dsi; /* dst slot index */ s8 *sstbl, *dstbl; /* sorted entry table */ int snamlen, len; struct ldtentry *slh, *dlh = NULL; struct idtentry *sih, *dih = NULL; struct dtslot *h, *s, *d; struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; struct lv *slv, *dlv; int xssi, ns, nd; int sfsi; sstbl = (s8 *) & sp->slot[sp->header.stblindex]; dstbl = (s8 *) & dp->slot[dp->header.stblindex]; dsi = dp->header.freelist; /* first (whole page) free slot */ sfsi = sp->header.freelist; /* linelock destination entry slot */ dlv = & ddtlck->lv[ddtlck->index]; dlv->offset = dsi; /* linelock source entry slot */ slv = & sdtlck->lv[sdtlck->index]; slv->offset = sstbl[si]; xssi = slv->offset - 1; /* * move entries */ ns = nd = 0; for (di = 0; si < sp->header.nextindex; si++, di++) { ssi = sstbl[si]; dstbl[di] = dsi; /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* * move head/only segment of an entry */ /* get dst slot */ h = d = &dp->slot[dsi]; /* get src slot and move */ s = &sp->slot[ssi]; if (sp->header.flag & BT_LEAF) { /* get source entry */ slh = (struct ldtentry *) s; dlh = (struct ldtentry *) h; snamlen = slh->namlen; if (do_index) { len = min(snamlen, DTLHDRDATALEN); dlh->index = slh->index; /* little-endian */ } else len = min(snamlen, DTLHDRDATALEN_LEGACY); memcpy(dlh, slh, 6 + len * 2); next = slh->next; /* update dst head/only segment next field */ dsi++; dlh->next = dsi; } else { sih = (struct idtentry *) s; snamlen = sih->namlen; len = min(snamlen, DTIHDRDATALEN); dih = (struct idtentry *) h; memcpy(dih, sih, 10 + len * 2); next = sih->next; dsi++; dih->next = dsi; } /* free src head/only segment */ s->next = sfsi; s->cnt = 1; sfsi = ssi; ns++; nd++; xssi = ssi; /* * move additional segment(s) of the entry */ snamlen -= len; while ((ssi = next) >= 0) { /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* get next source segment */ s = &sp->slot[ssi]; /* get next destination free slot */ d++; len = min(snamlen, DTSLOTDATALEN); UniStrncpy_le(d->name, s->name, len); ns++; nd++; xssi = ssi; dsi++; d->next = dsi; /* free source segment */ next = s->next; s->next = sfsi; s->cnt = 1; sfsi = ssi; snamlen -= len; } /* end while */ /* terminate dst last/only segment */ if (h == d) { /* single segment entry */ if (dp->header.flag & BT_LEAF) dlh->next = -1; else dih->next = -1; } else /* multi-segment entry */ d->next = -1; } /* end for */ /* close current linelock */ slv->length = ns; sdtlck->index++; *sdtlock = sdtlck; dlv->length = nd; ddtlck->index++; *ddtlock = ddtlck; /* update source header */ sp->header.freelist = sfsi; sp->header.freecnt += nd; /* update destination header */ dp->header.nextindex = di; dp->header.freelist = dsi; dp->header.freecnt -= nd; } /* * dtDeleteEntry() * * function: free a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); fsi = stbl[fi]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; /* get the head/only segment */ t = &p->slot[fsi]; if (p->header.flag & BT_LEAF) si = ((struct ldtentry *) t)->next; else si = ((struct idtentry *) t)->next; t->next = si; t->cnt = 1; n = freecnt = 1; xsi = fsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; /* if delete from middle, * shift left the succedding entries in the stbl */ si = p->header.nextindex; if (fi < si - 1) memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); p->header.nextindex--; } /* * dtTruncateEntry() * * function: truncate a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) { int tsi; /* truncate entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int fsi, xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); tsi = stbl[ti]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = tsi; /* get the head/only segment */ t = &p->slot[tsi]; ASSERT(p->header.flag & BT_INTERNAL); ((struct idtentry *) t)->namlen = 0; si = ((struct idtentry *) t)->next; ((struct idtentry *) t)->next = -1; n = 1; freecnt = 0; fsi = si; xsi = tsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ if (freecnt == 0) return; t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; } /* * dtLinelockFreelist() */ static void dtLinelockFreelist(dtpage_t * p, /* directory page */ int m, /* max slot index */ struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ struct dtslot *t; int si; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ fsi = p->header.freelist; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; n = 1; xsi = fsi; t = &p->slot[fsi]; si = t->next; /* find the last/only segment */ while (si < m && si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; t = &p->slot[si]; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; } /* * NAME: dtModify * * FUNCTION: Modify the inode number part of a directory entry * * PARAMETERS: * tid - Transaction id * ip - Inode of parent directory * key - Name of entry to be modified * orig_ino - Original inode number expected in entry * new_ino - New inode number to put into entry * flag - JFS_RENAME * * RETURNS: * -ESTALE - If entry found does not match orig_ino passed in * -ENOENT - If no entry can be found to match key * 0 - If successfully modified entry */ int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) { int rc; s64 bn; struct metapage *mp; dtpage_t *p; int index; struct btstack btstack; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; s8 *stbl; int entry_si; /* entry slot index */ struct ldtentry *entry; /* * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page of named entry */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* get slot index of the entry */ stbl = DT_GETSTBL(p); entry_si = stbl[index]; /* linelock entry */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = entry_si; lv->length = 1; dtlck->index++; /* get the head/only segment */ entry = (struct ldtentry *) & p->slot[entry_si]; /* substitute the inode number of the entry */ entry->inumber = cpu_to_le32(new_ino); /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; }
1 1 1 1 1 5 2 6 1 14 6 1 1 1 1 15 8 2 9 7 6 6 6 7 6 11 12 2 10 11 7 5 5 7 2 2 12 12 11 5 6 6 6 5 5 1 1 1 1 6 5 4 5 1 1 6 1 1 1 1 1 1 1 1 1 1 11 4 4 3 3 3 3 3 3 3 3 4 4 1 3 4 1 4 4 1 3 4 4 3 4 4 5 5 5 2 2 2 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Domain Hash Table * * This file manages the domain hash table that NetLabel uses to determine * which network labeling protocol to use for a given domain. The NetLabel * system manages static and dynamic label mappings for network protocols such * as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/types.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <net/calipso.h> #include <asm/bug.h> #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" #include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" struct netlbl_domhsh_tbl { struct list_head *tbl; u32 size; }; /* Domain hash table */ /* updates should be so rare that having one spinlock for the entire hash table * should be okay */ static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh; static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4; static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6; /* * Domain Hash Table Helper Functions */ /** * netlbl_domhsh_free_entry - Frees a domain hash table entry * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to a hash table entry can be released * safely. * */ static void netlbl_domhsh_free_entry(struct rcu_head *entry) { struct netlbl_dom_map *ptr; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ ptr = container_of(entry, struct netlbl_dom_map, rcu); if (ptr->def.type == NETLBL_NLTYPE_ADDRSELECT) { netlbl_af4list_foreach_safe(iter4, tmp4, &ptr->def.addrsel->list4) { netlbl_af4list_remove_entry(iter4); kfree(netlbl_domhsh_addr4_entry(iter4)); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &ptr->def.addrsel->list6) { netlbl_af6list_remove_entry(iter6); kfree(netlbl_domhsh_addr6_entry(iter6)); } #endif /* IPv6 */ kfree(ptr->def.addrsel); } kfree(ptr->domain); kfree(ptr); } /** * netlbl_domhsh_hash - Hashing function for the domain hash table * @key: the domain name to hash * * Description: * This is the hashing function for the domain hash table, it returns the * correct bucket number for the domain. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or the * hash table lock. * */ static u32 netlbl_domhsh_hash(const char *key) { u32 iter; u32 val; u32 len; /* This is taken (with slight modification) from * security/selinux/ss/symtab.c:symhash() */ for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); } static bool netlbl_family_match(u16 f1, u16 f2) { return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC); } /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table * entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC * which matches any address family entries. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or the * hash table lock. * */ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u16 family) { u32 bkt; struct list_head *bkt_list; struct netlbl_dom_map *iter; if (domain != NULL) { bkt = netlbl_domhsh_hash(domain); bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list, lockdep_is_held(&netlbl_domhsh_lock)) if (iter->valid && netlbl_family_match(iter->family, family) && strcmp(iter->domain, domain) == 0) return iter; } return NULL; } /** * netlbl_domhsh_search_def - Search for a domain entry * @domain: the domain * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table * entry if an exact match is found, if an exact match is not present in the * hash table then the default entry is returned if valid otherwise NULL is * returned. @family may be %AF_UNSPEC which matches any address family * entries. The caller is responsible ensuring that the hash table is * protected with either a RCU read lock or the hash table lock. * */ static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain, u16 family) { struct netlbl_dom_map *entry; entry = netlbl_domhsh_search(domain, family); if (entry != NULL) return entry; if (family == AF_INET || family == AF_UNSPEC) { entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4); if (entry != NULL && entry->valid) return entry; } if (family == AF_INET6 || family == AF_UNSPEC) { entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6); if (entry != NULL && entry->valid) return entry; } return NULL; } /** * netlbl_domhsh_audit_add - Generate an audit entry for an add event * @entry: the entry being added * @addr4: the IPv4 address information * @addr6: the IPv6 address information * @result: the result code * @audit_info: NetLabel audit information * * Description: * Generate an audit record for adding a new NetLabel/LSM mapping entry with * the given information. Caller is responsible for holding the necessary * locks. * */ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, struct netlbl_af4list *addr4, struct netlbl_af6list *addr6, int result, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; struct cipso_v4_doi *cipsov4 = NULL; struct calipso_doi *calipso = NULL; u32 type; audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " nlbl_domain=%s", entry->domain ? entry->domain : "(default)"); if (addr4 != NULL) { struct netlbl_domaddr4_map *map4; map4 = netlbl_domhsh_addr4_entry(addr4); type = map4->def.type; cipsov4 = map4->def.cipso; netlbl_af4list_audit_addr(audit_buf, 0, NULL, addr4->addr, addr4->mask); #if IS_ENABLED(CONFIG_IPV6) } else if (addr6 != NULL) { struct netlbl_domaddr6_map *map6; map6 = netlbl_domhsh_addr6_entry(addr6); type = map6->def.type; calipso = map6->def.calipso; netlbl_af6list_audit_addr(audit_buf, 0, NULL, &addr6->addr, &addr6->mask); #endif /* IPv6 */ } else { type = entry->def.type; cipsov4 = entry->def.cipso; calipso = entry->def.calipso; } switch (type) { case NETLBL_NLTYPE_UNLABELED: audit_log_format(audit_buf, " nlbl_protocol=unlbl"); break; case NETLBL_NLTYPE_CIPSOV4: BUG_ON(cipsov4 == NULL); audit_log_format(audit_buf, " nlbl_protocol=cipsov4 cipso_doi=%u", cipsov4->doi); break; case NETLBL_NLTYPE_CALIPSO: BUG_ON(calipso == NULL); audit_log_format(audit_buf, " nlbl_protocol=calipso calipso_doi=%u", calipso->doi); break; } audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); audit_log_end(audit_buf); } } /** * netlbl_domhsh_validate - Validate a new domain mapping entry * @entry: the entry to validate * * This function validates the new domain mapping entry to ensure that it is * a valid entry. Returns zero on success, negative values on failure. * */ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) { struct netlbl_af4list *iter4; struct netlbl_domaddr4_map *map4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_domaddr6_map *map6; #endif /* IPv6 */ if (entry == NULL) return -EINVAL; if (entry->family != AF_INET && entry->family != AF_INET6 && (entry->family != AF_UNSPEC || entry->def.type != NETLBL_NLTYPE_UNLABELED)) return -EINVAL; switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: if (entry->def.cipso != NULL || entry->def.calipso != NULL || entry->def.addrsel != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CIPSOV4: if (entry->family != AF_INET || entry->def.cipso == NULL) return -EINVAL; break; case NETLBL_NLTYPE_CALIPSO: if (entry->family != AF_INET6 || entry->def.calipso == NULL) return -EINVAL; break; case NETLBL_NLTYPE_ADDRSELECT: netlbl_af4list_foreach(iter4, &entry->def.addrsel->list4) { map4 = netlbl_domhsh_addr4_entry(iter4); switch (map4->def.type) { case NETLBL_NLTYPE_UNLABELED: if (map4->def.cipso != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CIPSOV4: if (map4->def.cipso == NULL) return -EINVAL; break; default: return -EINVAL; } } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach(iter6, &entry->def.addrsel->list6) { map6 = netlbl_domhsh_addr6_entry(iter6); switch (map6->def.type) { case NETLBL_NLTYPE_UNLABELED: if (map6->def.calipso != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CALIPSO: if (map6->def.calipso == NULL) return -EINVAL; break; default: return -EINVAL; } } #endif /* IPv6 */ break; default: return -EINVAL; } return 0; } /* * Domain Hash Table Functions */ /** * netlbl_domhsh_init - Init for the domain hash * @size: the number of bits to use for the hash buckets * * Description: * Initializes the domain hash table, should be called only by * netlbl_user_init() during initialization. Returns zero on success, non-zero * values on error. * */ int __init netlbl_domhsh_init(u32 size) { u32 iter; struct netlbl_domhsh_tbl *hsh_tbl; if (size == 0) return -EINVAL; hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; hsh_tbl->tbl = kcalloc(hsh_tbl->size, sizeof(struct list_head), GFP_KERNEL); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; } for (iter = 0; iter < hsh_tbl->size; iter++) INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); spin_lock(&netlbl_domhsh_lock); rcu_assign_pointer(netlbl_domhsh, hsh_tbl); spin_unlock(&netlbl_domhsh_lock); return 0; } /** * netlbl_domhsh_add - Adds a entry to the domain hash table * @entry: the entry to add * @audit_info: NetLabel audit information * * Description: * Adds a new entry to the domain hash table and handles any updates to the * lower level protocol handler (i.e. CIPSO). @entry->family may be set to * %AF_UNSPEC which will add an entry that matches all address families. This * is only useful for the unlabelled type and will only succeed if there is no * existing entry for any address family with the same domain. Returns zero * on success, negative on failure. * */ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info) { int ret_val = 0; struct netlbl_dom_map *entry_old, *entry_b; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ ret_val = netlbl_domhsh_validate(entry); if (ret_val != 0) return ret_val; /* XXX - we can remove this RCU read lock as the spinlock protects the * entire function, but before we do we need to fixup the * netlbl_af[4,6]list RCU functions to do "the right thing" with * respect to rcu_dereference() when only a spinlock is held. */ rcu_read_lock(); spin_lock(&netlbl_domhsh_lock); if (entry->domain != NULL) entry_old = netlbl_domhsh_search(entry->domain, entry->family); else entry_old = netlbl_domhsh_search_def(entry->domain, entry->family); if (entry_old == NULL) { entry->valid = 1; if (entry->domain != NULL) { u32 bkt = netlbl_domhsh_hash(entry->domain); list_add_tail_rcu(&entry->list, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&entry->list); switch (entry->family) { case AF_INET: rcu_assign_pointer(netlbl_domhsh_def_ipv4, entry); break; case AF_INET6: rcu_assign_pointer(netlbl_domhsh_def_ipv6, entry); break; case AF_UNSPEC: if (entry->def.type != NETLBL_NLTYPE_UNLABELED) { ret_val = -EINVAL; goto add_return; } entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC); if (entry_b == NULL) { ret_val = -ENOMEM; goto add_return; } entry_b->family = AF_INET6; entry_b->def.type = NETLBL_NLTYPE_UNLABELED; entry_b->valid = 1; entry->family = AF_INET; rcu_assign_pointer(netlbl_domhsh_def_ipv4, entry); rcu_assign_pointer(netlbl_domhsh_def_ipv6, entry_b); break; default: /* Already checked in * netlbl_domhsh_validate(). */ ret_val = -EINVAL; goto add_return; } } if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) { netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) netlbl_domhsh_audit_add(entry, iter4, NULL, ret_val, audit_info); #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) netlbl_domhsh_audit_add(entry, NULL, iter6, ret_val, audit_info); #endif /* IPv6 */ } else netlbl_domhsh_audit_add(entry, NULL, NULL, ret_val, audit_info); } else if (entry_old->def.type == NETLBL_NLTYPE_ADDRSELECT && entry->def.type == NETLBL_NLTYPE_ADDRSELECT) { struct list_head *old_list4; struct list_head *old_list6; old_list4 = &entry_old->def.addrsel->list4; old_list6 = &entry_old->def.addrsel->list6; /* we only allow the addition of address selectors if all of * the selectors do not exist in the existing domain map */ netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) if (netlbl_af4list_search_exact(iter4->addr, iter4->mask, old_list4)) { ret_val = -EEXIST; goto add_return; } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) if (netlbl_af6list_search_exact(&iter6->addr, &iter6->mask, old_list6)) { ret_val = -EEXIST; goto add_return; } #endif /* IPv6 */ netlbl_af4list_foreach_safe(iter4, tmp4, &entry->def.addrsel->list4) { netlbl_af4list_remove_entry(iter4); iter4->valid = 1; ret_val = netlbl_af4list_add(iter4, old_list4); netlbl_domhsh_audit_add(entry_old, iter4, NULL, ret_val, audit_info); if (ret_val != 0) goto add_return; } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &entry->def.addrsel->list6) { netlbl_af6list_remove_entry(iter6); iter6->valid = 1; ret_val = netlbl_af6list_add(iter6, old_list6); netlbl_domhsh_audit_add(entry_old, NULL, iter6, ret_val, audit_info); if (ret_val != 0) goto add_return; } #endif /* IPv6 */ /* cleanup the new entry since we've moved everything over */ netlbl_domhsh_free_entry(&entry->rcu); } else ret_val = -EINVAL; add_return: spin_unlock(&netlbl_domhsh_lock); rcu_read_unlock(); return ret_val; } /** * netlbl_domhsh_add_default - Adds the default entry to the domain hash table * @entry: the entry to add * @audit_info: NetLabel audit information * * Description: * Adds a new default entry to the domain hash table and handles any updates * to the lower level protocol handler (i.e. CIPSO). Returns zero on success, * negative on failure. * */ int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info) { return netlbl_domhsh_add(entry, audit_info); } /** * netlbl_domhsh_remove_entry - Removes a given entry from the domain table * @entry: the entry to remove * @audit_info: NetLabel audit information * * Description: * Removes an entry from the domain hash table and handles any updates to the * lower level protocol handler (i.e. CIPSO). Caller is responsible for * ensuring that the RCU read lock is held. Returns zero on success, negative * on failure. * */ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info) { int ret_val = 0; struct audit_buffer *audit_buf; struct netlbl_af4list *iter4; struct netlbl_domaddr4_map *map4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_domaddr6_map *map6; #endif /* IPv6 */ if (entry == NULL) return -ENOENT; spin_lock(&netlbl_domhsh_lock); if (entry->valid) { entry->valid = 0; if (entry == rcu_dereference(netlbl_domhsh_def_ipv4)) RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL); else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6)) RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL); else list_del_rcu(&entry->list); } else ret_val = -ENOENT; spin_unlock(&netlbl_domhsh_lock); if (ret_val) return ret_val; audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " nlbl_domain=%s res=1", entry->domain ? entry->domain : "(default)"); audit_log_end(audit_buf); } switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) { map4 = netlbl_domhsh_addr4_entry(iter4); cipso_v4_doi_putdef(map4->def.cipso); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) { map6 = netlbl_domhsh_addr6_entry(iter6); calipso_doi_putdef(map6->def.calipso); } #endif /* IPv6 */ break; case NETLBL_NLTYPE_CIPSOV4: cipso_v4_doi_putdef(entry->def.cipso); break; #if IS_ENABLED(CONFIG_IPV6) case NETLBL_NLTYPE_CALIPSO: calipso_doi_putdef(entry->def.calipso); break; #endif /* IPv6 */ } call_rcu(&entry->rcu, netlbl_domhsh_free_entry); return ret_val; } /** * netlbl_domhsh_remove_af4 - Removes an address selector entry * @domain: the domain * @addr: IPv4 address * @mask: IPv4 address mask * @audit_info: NetLabel audit information * * Description: * Removes an individual address selector from a domain mapping and potentially * the entire mapping if it is empty. Returns zero on success, negative values * on failure. * */ int netlbl_domhsh_remove_af4(const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_dom_map *entry_map; struct netlbl_af4list *entry_addr; struct netlbl_af4list *iter4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; #endif /* IPv6 */ struct netlbl_domaddr4_map *entry; rcu_read_lock(); if (domain) entry_map = netlbl_domhsh_search(domain, AF_INET); else entry_map = netlbl_domhsh_search_def(domain, AF_INET); if (entry_map == NULL || entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) goto remove_af4_failure; spin_lock(&netlbl_domhsh_lock); entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr, &entry_map->def.addrsel->list4); spin_unlock(&netlbl_domhsh_lock); if (entry_addr == NULL) goto remove_af4_failure; netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4) goto remove_af4_single_addr; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6) goto remove_af4_single_addr; #endif /* IPv6 */ /* the domain mapping is empty so remove it from the mapping table */ netlbl_domhsh_remove_entry(entry_map, audit_info); remove_af4_single_addr: rcu_read_unlock(); /* yick, we can't use call_rcu here because we don't have a rcu head * pointer but hopefully this should be a rare case so the pause * shouldn't be a problem */ synchronize_rcu(); entry = netlbl_domhsh_addr4_entry(entry_addr); cipso_v4_doi_putdef(entry->def.cipso); kfree(entry); return 0; remove_af4_failure: rcu_read_unlock(); return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_domhsh_remove_af6 - Removes an address selector entry * @domain: the domain * @addr: IPv6 address * @mask: IPv6 address mask * @audit_info: NetLabel audit information * * Description: * Removes an individual address selector from a domain mapping and potentially * the entire mapping if it is empty. Returns zero on success, negative values * on failure. * */ int netlbl_domhsh_remove_af6(const char *domain, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_dom_map *entry_map; struct netlbl_af6list *entry_addr; struct netlbl_af4list *iter4; struct netlbl_af6list *iter6; struct netlbl_domaddr6_map *entry; rcu_read_lock(); if (domain) entry_map = netlbl_domhsh_search(domain, AF_INET6); else entry_map = netlbl_domhsh_search_def(domain, AF_INET6); if (entry_map == NULL || entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) goto remove_af6_failure; spin_lock(&netlbl_domhsh_lock); entry_addr = netlbl_af6list_remove(addr, mask, &entry_map->def.addrsel->list6); spin_unlock(&netlbl_domhsh_lock); if (entry_addr == NULL) goto remove_af6_failure; netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4) goto remove_af6_single_addr; netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6) goto remove_af6_single_addr; /* the domain mapping is empty so remove it from the mapping table */ netlbl_domhsh_remove_entry(entry_map, audit_info); remove_af6_single_addr: rcu_read_unlock(); /* yick, we can't use call_rcu here because we don't have a rcu