// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Ruleset management
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 */

#include <linux/bits.h>
#include <linux/bug.h>
#include <linux/cleanup.h>
#include <linux/compiler_types.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/overflow.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#include "access.h"
#include "audit.h"
#include "domain.h"
#include "limits.h"
#include "object.h"
#include
"ruleset.h" static struct landlock_ruleset *create_ruleset(const u32 num_layers) { struct landlock_ruleset *new_ruleset; new_ruleset = kzalloc(struct_size(new_ruleset, access_masks, num_layers), GFP_KERNEL_ACCOUNT); if (!new_ruleset) return ERR_PTR(-ENOMEM); refcount_set(&new_ruleset->usage, 1); mutex_init(&new_ruleset->lock); new_ruleset->root_inode = RB_ROOT; #if IS_ENABLED(CONFIG_INET) new_ruleset->root_net_port = RB_ROOT; #endif /* IS_ENABLED(CONFIG_INET) */ new_ruleset->num_layers = num_layers; /* * hierarchy = NULL * num_rules = 0 * access_masks[] = 0 */ return new_ruleset; } struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask, const access_mask_t net_access_mask, const access_mask_t scope_mask) { struct landlock_ruleset *new_ruleset; /* Informs about useless ruleset. */ if (!fs_access_mask && !net_access_mask && !scope_mask) return ERR_PTR(-ENOMSG); new_ruleset = create_ruleset(1); if (IS_ERR(new_ruleset)) return new_ruleset; if (fs_access_mask) landlock_add_fs_access_mask(new_ruleset, fs_access_mask, 0); if (net_access_mask) landlock_add_net_access_mask(new_ruleset, net_access_mask, 0); if (scope_mask) landlock_add_scope_mask(new_ruleset, scope_mask, 0); return new_ruleset; } static void build_check_rule(void) { const struct landlock_rule rule = { .num_layers = ~0, }; /* * Checks that .num_layers is large enough for at least * LANDLOCK_MAX_NUM_LAYERS layers. */ BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS); } static bool is_object_pointer(const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return true; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return false; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return false; } } static struct landlock_rule * create_rule(const struct landlock_id id, const struct landlock_layer (*const layers)[], const u32 num_layers, const struct landlock_layer *const new_layer) { struct landlock_rule *new_rule; u32 new_num_layers; build_check_rule(); if (new_layer) { /* Should already be checked by landlock_merge_ruleset(). */ if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS)) return ERR_PTR(-E2BIG); new_num_layers = num_layers + 1; } else { new_num_layers = num_layers; } new_rule = kzalloc(struct_size(new_rule, layers, new_num_layers), GFP_KERNEL_ACCOUNT); if (!new_rule) return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { /* This should have been caught by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } new_rule->key = id.key; new_rule->num_layers = new_num_layers; /* Copies the original layer stack. */ memcpy(new_rule->layers, layers, flex_array_size(new_rule, layers, num_layers)); if (new_layer) /* Adds a copy of @new_layer on the layer stack. 
*/ new_rule->layers[new_rule->num_layers - 1] = *new_layer; return new_rule; } static struct rb_root *get_root(struct landlock_ruleset *const ruleset, const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return &ruleset->root_inode; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return &ruleset->root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } static void free_rule(struct landlock_rule *const rule, const enum landlock_key_type key_type) { might_sleep(); if (!rule) return; if (is_object_pointer(key_type)) landlock_put_object(rule->key.object); kfree(rule); } static void build_check_ruleset(void) { const struct landlock_ruleset ruleset = { .num_rules = ~0, .num_layers = ~0, }; BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES); BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS); } /** * insert_rule - Create and insert a rule in a ruleset * * @ruleset: The ruleset to be updated. * @id: The ID to build the new rule with. The underlying kernel object, if * any, must be held by the caller. * @layers: One or multiple layers to be copied into the new rule. * @num_layers: The number of @layers entries. * * When user space requests to add a new rule to a ruleset, @layers only * contains one entry and this entry is not assigned to any level. In this * case, the new rule will extend @ruleset, similarly to a boolean OR between * access rights. * * When merging a ruleset in a domain, or copying a domain, @layers will be * added to @ruleset as new constraints, similarly to a boolean AND between * access rights. */ static int insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const struct landlock_layer (*const layers)[], const size_t num_layers) { struct rb_node **walker_node; struct rb_node *parent_node = NULL; struct landlock_rule *new_rule; struct rb_root *root; might_sleep(); lockdep_assert_held(&ruleset->lock); if (WARN_ON_ONCE(!layers)) return -ENOENT; if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object)) return -ENOENT; root = get_root(ruleset, id.type); if (IS_ERR(root)) return PTR_ERR(root); walker_node = &root->rb_node; while (*walker_node) { struct landlock_rule *const this = rb_entry(*walker_node, struct landlock_rule, node); if (this->key.data != id.key.data) { parent_node = *walker_node; if (this->key.data < id.key.data) walker_node = &((*walker_node)->rb_right); else walker_node = &((*walker_node)->rb_left); continue; } /* Only a single-level layer should match an existing rule. */ if (WARN_ON_ONCE(num_layers != 1)) return -EINVAL; /* If there is a matching rule, updates it. */ if ((*layers)[0].level == 0) { /* * Extends access rights when the request comes from * landlock_add_rule(2), i.e. @ruleset is not a domain. */ if (WARN_ON_ONCE(this->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(this->layers[0].level != 0)) return -EINVAL; this->layers[0].access |= (*layers)[0].access; return 0; } if (WARN_ON_ONCE(this->layers[0].level == 0)) return -EINVAL; /* * Intersects access rights when it is a merge between a * ruleset and a domain. */ new_rule = create_rule(id, &this->layers, this->num_layers, &(*layers)[0]); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_replace_node(&this->node, &new_rule->node, root); free_rule(this, id.type); return 0; } /* There is no match for @id. 
*/ build_check_ruleset(); if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES) return -E2BIG; new_rule = create_rule(id, layers, num_layers, NULL); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_link_node(&new_rule->node, parent_node, walker_node); rb_insert_color(&new_rule->node, root); ruleset->num_rules++; return 0; } static void build_check_layer(void) { const struct landlock_layer layer = { .level = ~0, .access = ~0, }; /* * Checks that .level and .access are large enough to contain their expected * maximum values. */ BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS); } /* @ruleset must be locked by the caller. */ int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access) { struct landlock_layer layers[] = { { .access = access, /* When @level is zero, insert_rule() extends @ruleset. */ .level = 0, } }; build_check_layer(); return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers)); } static int merge_tree(struct landlock_ruleset *const dst, struct landlock_ruleset *const src, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *src_root; int err = 0; might_sleep(); lockdep_assert_held(&dst->lock); lockdep_assert_held(&src->lock); src_root = get_root(src, key_type); if (IS_ERR(src_root)) return PTR_ERR(src_root); /* Merges the @src tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root, node) { struct landlock_layer layers[] = { { .level = dst->num_layers, } }; const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; if (WARN_ON_ONCE(walker_rule->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) return -EINVAL; layers[0].access = walker_rule->layers[0].access; err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers)); if (err) return err; } return err; } static int merge_ruleset(struct landlock_ruleset *const dst, struct landlock_ruleset *const src) { int err = 0; might_sleep(); /* Should already be checked by landlock_merge_ruleset() */ if (WARN_ON_ONCE(!src)) return 0; /* Only merge into a domain. */ if (WARN_ON_ONCE(!dst || !dst->hierarchy)) return -EINVAL; /* Locks @dst first because we are its only owner. */ mutex_lock(&dst->lock); mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING); /* Stacks the new layer. */ if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) { err = -EINVAL; goto out_unlock; } dst->access_masks[dst->num_layers - 1] = landlock_upgrade_handled_access_masks(src->access_masks[0]); /* Merges the @src inode tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Merges the @src network port tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ out_unlock: mutex_unlock(&src->lock); mutex_unlock(&dst->lock); return err; } static int inherit_tree(struct landlock_ruleset *const parent, struct landlock_ruleset *const child, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *parent_root; int err = 0; might_sleep(); lockdep_assert_held(&parent->lock); lockdep_assert_held(&child->lock); parent_root = get_root(parent, key_type); if (IS_ERR(parent_root)) return PTR_ERR(parent_root); /* Copies the @parent inode or network tree. 
*/ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, parent_root, node) { const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; err = insert_rule(child, id, &walker_rule->layers, walker_rule->num_layers); if (err) return err; } return err; } static int inherit_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const child) { int err = 0; might_sleep(); if (!parent) return 0; /* Locks @child first because we are its only owner. */ mutex_lock(&child->lock); mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); /* Copies the @parent inode tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Copies the @parent network port tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) { err = -EINVAL; goto out_unlock; } /* Copies the parent layer stack and leaves a space for the new layer. */ memcpy(child->access_masks, parent->access_masks, flex_array_size(parent, access_masks, parent->num_layers)); if (WARN_ON_ONCE(!parent->hierarchy)) { err = -EINVAL; goto out_unlock; } landlock_get_hierarchy(parent->hierarchy); child->hierarchy->parent = parent->hierarchy; out_unlock: mutex_unlock(&parent->lock); mutex_unlock(&child->lock); return err; } static void free_ruleset(struct landlock_ruleset *const ruleset) { struct landlock_rule *freeme, *next; might_sleep(); rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode, node) free_rule(freeme, LANDLOCK_KEY_INODE); #if IS_ENABLED(CONFIG_INET) rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_net_port, node) free_rule(freeme, LANDLOCK_KEY_NET_PORT); #endif /* IS_ENABLED(CONFIG_INET) */ landlock_put_hierarchy(ruleset->hierarchy); kfree(ruleset); } void landlock_put_ruleset(struct landlock_ruleset *const ruleset) { might_sleep(); if (ruleset && refcount_dec_and_test(&ruleset->usage)) free_ruleset(ruleset); } static void free_ruleset_work(struct work_struct *const work) { struct landlock_ruleset *ruleset; ruleset = container_of(work, struct landlock_ruleset, work_free); free_ruleset(ruleset); } /* Only called by hook_cred_free(). */ void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset) { if (ruleset && refcount_dec_and_test(&ruleset->usage)) { INIT_WORK(&ruleset->work_free, free_ruleset_work); schedule_work(&ruleset->work_free); } } /** * landlock_merge_ruleset - Merge a ruleset with a domain * * @parent: Parent domain. * @ruleset: New ruleset to be merged. * * The current task is requesting to be restricted. The subjective credentials * must not be in an overridden state. cf. landlock_init_hierarchy_log(). * * Returns the intersection of @parent and @ruleset, or returns @parent if * @ruleset is empty, or returns a duplicate of @ruleset if @parent is empty. */ struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset) { struct landlock_ruleset *new_dom __free(landlock_put_ruleset) = NULL; u32 num_layers; int err; might_sleep(); if (WARN_ON_ONCE(!ruleset || parent == ruleset)) return ERR_PTR(-EINVAL); if (parent) { if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS) return ERR_PTR(-E2BIG); num_layers = parent->num_layers + 1; } else { num_layers = 1; } /* Creates a new domain... 
*/ new_dom = create_ruleset(num_layers); if (IS_ERR(new_dom)) return new_dom; new_dom->hierarchy = kzalloc(sizeof(*new_dom->hierarchy), GFP_KERNEL_ACCOUNT); if (!new_dom->hierarchy) return ERR_PTR(-ENOMEM); refcount_set(&new_dom->hierarchy->usage, 1); /* ...as a child of @parent... */ err = inherit_ruleset(parent, new_dom); if (err) return ERR_PTR(err); /* ...and including @ruleset. */ err = merge_ruleset(new_dom, ruleset); if (err) return ERR_PTR(err); err = landlock_init_hierarchy_log(new_dom->hierarchy); if (err) return ERR_PTR(err); return no_free_ptr(new_dom); } /* * The returned access has the same lifetime as @ruleset. */ const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id) { const struct rb_root *root; const struct rb_node *node; root = get_root((struct landlock_ruleset *)ruleset, id.type); if (IS_ERR(root)) return NULL; node = root->rb_node; while (node) { struct landlock_rule *this = rb_entry(node, struct landlock_rule, node); if (this->key.data == id.key.data) return this; if (this->key.data < id.key.data) node = node->rb_right; else node = node->rb_left; } return NULL; } /* * @layer_masks is read and may be updated according to the access request and * the matching rule. * @masks_array_size must be equal to ARRAY_SIZE(*layer_masks). * * Returns true if the request is allowed (i.e. relevant layer masks for the * request are empty). */ bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size) { size_t layer_level; if (!access_request || !layer_masks) return true; if (!rule) return false; /* * An access is granted if, for each policy layer, at least one rule * encountered on the pathwalk grants the requested access, * regardless of its position in the layer stack. We must then check * the remaining layers for each inode, from the first added layer to * the last one. When there is multiple requested accesses, for each * policy layer, the full set of requested accesses may not be granted * by only one rule, but by the union (binary OR) of multiple rules. * E.g. /a/b <execute> + /a <read> => /a/b <execute + read> */ for (layer_level = 0; layer_level < rule->num_layers; layer_level++) { const struct landlock_layer *const layer = &rule->layers[layer_level]; const layer_mask_t layer_bit = BIT_ULL(layer->level - 1); const unsigned long access_req = access_request; unsigned long access_bit; bool is_empty; /* * Records in @layer_masks which layer grants access to each requested * access: bit cleared if the related layer grants access. */ is_empty = true; for_each_set_bit(access_bit, &access_req, masks_array_size) { if (layer->access & BIT_ULL(access_bit)) (*layer_masks)[access_bit] &= ~layer_bit; is_empty = is_empty && !(*layer_masks)[access_bit]; } if (is_empty) return true; } return false; } typedef access_mask_t get_access_mask_t(const struct landlock_ruleset *const ruleset, const u16 layer_level); /** * landlock_init_layer_masks - Initialize layer masks from an access request * * Populates @layer_masks such that for each access right in @access_request, * the bits for all the layers are set where this access right is handled. * * @domain: The domain that defines the current restrictions. * @access_request: The requested access rights to check. * @layer_masks: It must contain %LANDLOCK_NUM_ACCESS_FS or * %LANDLOCK_NUM_ACCESS_NET elements according to @key_type. 
* @key_type: The key type to switch between access masks of different types. * * Returns: An access mask where each access right bit is set which is handled * in any of the active layers in @domain. */ access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type) { access_mask_t handled_accesses = 0; size_t layer_level, num_access; get_access_mask_t *get_access_mask; switch (key_type) { case LANDLOCK_KEY_INODE: get_access_mask = landlock_get_fs_access_mask; num_access = LANDLOCK_NUM_ACCESS_FS; break; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: get_access_mask = landlock_get_net_access_mask; num_access = LANDLOCK_NUM_ACCESS_NET; break; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return 0; } memset(layer_masks, 0, array_size(sizeof((*layer_masks)[0]), num_access)); /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; /* Saves all handled accesses per layer. */ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { const unsigned long access_req = access_request; const access_mask_t access_mask = get_access_mask(domain, layer_level); unsigned long access_bit; for_each_set_bit(access_bit, &access_req, num_access) { if (BIT_ULL(access_bit) & access_mask) { (*layer_masks)[access_bit] |= BIT_ULL(layer_level); handled_accesses |= BIT_ULL(access_bit); } } } return handled_accesses; } |
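The kerneldoc above for insert_rule() and landlock_merge_ruleset() describes how a rule added from user space extends one layer (boolean OR of access rights) while merging a ruleset into a domain stacks a new layer that is intersected with the existing ones (boolean AND). Below is a minimal user-space sketch of how such a layer is created and enforced; it assumes the documented Landlock UAPI in <linux/landlock.h> and raw syscall numbers from <sys/syscall.h>, and the path and access masks are illustrative only. Calling a helper like this twice in the same task would stack two layers, which the kernel merges as described above.

```c
/* Illustrative sketch: one Landlock layer that only allows reads under /tmp. */
#include <fcntl.h>
#include <linux/landlock.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int restrict_to_tmp_readonly(void)
{
	const struct landlock_ruleset_attr ruleset_attr = {
		/* Access rights handled by this layer; anything handled but not
		 * granted by a rule is denied. */
		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
				     LANDLOCK_ACCESS_FS_READ_DIR |
				     LANDLOCK_ACCESS_FS_WRITE_FILE,
	};
	struct landlock_path_beneath_attr path_beneath = {
		/* Rule: below /tmp, allow reads only. */
		.allowed_access = LANDLOCK_ACCESS_FS_READ_FILE |
				  LANDLOCK_ACCESS_FS_READ_DIR,
	};
	int ruleset_fd;

	ruleset_fd = syscall(SYS_landlock_create_ruleset, &ruleset_attr,
			     sizeof(ruleset_attr), 0);
	if (ruleset_fd < 0)
		return -1;

	path_beneath.parent_fd = open("/tmp", O_PATH | O_CLOEXEC);
	if (path_beneath.parent_fd < 0)
		goto err;
	if (syscall(SYS_landlock_add_rule, ruleset_fd,
		    LANDLOCK_RULE_PATH_BENEATH, &path_beneath, 0))
		goto err_close_parent;
	close(path_beneath.parent_fd);

	/* Required once per task before self-restriction. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    syscall(SYS_landlock_restrict_self, ruleset_fd, 0))
		goto err;

	close(ruleset_fd);
	return 0;

err_close_parent:
	close(path_beneath.parent_fd);
err:
	close(ruleset_fd);
	return -1;
}
```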
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UIDGID_H
#define _LINUX_UIDGID_H

/*
 * A set of types for the internal kernel types representing uids and gids.
 *
 * The types defined in this header allow distinguishing which uids and gids in
 * the kernel are values used by userspace and which uid and gid values are
 * the internal kernel values. With the addition of user namespaces the values
 * can be different. Using the type system makes it possible for the compiler
 * to detect when we overlook these differences.
 *
 */
#include <linux/uidgid_types.h>
#include <linux/highuid.h>

struct user_namespace;
extern struct user_namespace init_user_ns;
struct uid_gid_map;

#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }

#ifdef CONFIG_MULTIUSER
static inline uid_t __kuid_val(kuid_t uid)
{
	return uid.val;
}

static inline gid_t __kgid_val(kgid_t gid)
{
	return gid.val;
}
#else
static inline uid_t __kuid_val(kuid_t uid)
{
	return 0;
}

static inline gid_t __kgid_val(kgid_t gid)
{
	return 0;
}
#endif

#define GLOBAL_ROOT_UID KUIDT_INIT(0)
#define GLOBAL_ROOT_GID KGIDT_INIT(0)

#define INVALID_UID KUIDT_INIT(-1)
#define INVALID_GID KGIDT_INIT(-1)

static inline bool uid_eq(kuid_t left, kuid_t right)
{
	return __kuid_val(left) == __kuid_val(right);
}

static inline bool gid_eq(kgid_t left, kgid_t right)
{
	return __kgid_val(left) == __kgid_val(right);
}

static inline bool uid_gt(kuid_t left, kuid_t right)
{
	return __kuid_val(left) > __kuid_val(right);
}

static inline bool gid_gt(kgid_t left, kgid_t right)
{
	return __kgid_val(left) > __kgid_val(right);
}

static inline bool uid_gte(kuid_t left, kuid_t right)
{
	return __kuid_val(left) >= __kuid_val(right);
}

static inline bool gid_gte(kgid_t left, kgid_t right)
{
	return __kgid_val(left) >= __kgid_val(right);
}

static inline bool uid_lt(kuid_t left, kuid_t right)
{
	return __kuid_val(left) < __kuid_val(right);
}

static inline bool gid_lt(kgid_t left, kgid_t right)
{
	return __kgid_val(left) < __kgid_val(right);
}

static inline bool uid_lte(kuid_t left, kuid_t right)
{
	return __kuid_val(left) <= __kuid_val(right);
}

static inline bool gid_lte(kgid_t left, kgid_t right)
{
	return __kgid_val(left) <= __kgid_val(right);
}

static inline bool uid_valid(kuid_t uid)
{
	return __kuid_val(uid) != (uid_t) -1;
}

static inline bool gid_valid(kgid_t gid)
{
	return __kgid_val(gid) != (gid_t) -1;
}

#ifdef CONFIG_USER_NS

extern kuid_t make_kuid(struct user_namespace *from, uid_t uid);
extern kgid_t make_kgid(struct user_namespace *from, gid_t gid);

extern uid_t from_kuid(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid(struct user_namespace *to, kgid_t gid);
extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid);

static inline bool
kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } static inline u32 map_id_down(struct uid_gid_map *map, u32 id) { return id; } static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { return id; } static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */ |
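The header comment above is explicit that kuid_t/kgid_t are distinct types from the userspace uid_t/gid_t values, so conversions have to go through make_kuid()/from_kuid() and comparisons through the typed helpers. A minimal sketch of the intended round trip follows; it assumes a kernel context where a struct user_namespace is at hand (init_user_ns here) and the function name is purely illustrative.

```c
/* Illustrative only: map a userspace-provided uid into a kuid_t, validate the
 * mapping, and compare it against the global root uid with the typed helpers
 * declared above. */
#include <linux/errno.h>
#include <linux/uidgid.h>

static int example_check_uid(uid_t user_value)
{
	kuid_t kuid = make_kuid(&init_user_ns, user_value);

	/* make_kuid() returns INVALID_UID when the namespace has no mapping. */
	if (!uid_valid(kuid))
		return -EINVAL;

	/* kuid is a struct, so "kuid == user_value" does not compile;
	 * the typed comparison helpers must be used instead. */
	if (uid_eq(kuid, GLOBAL_ROOT_UID))
		return 0;

	/* Going back to a userspace value requires an explicit conversion. */
	return from_kuid(&init_user_ns, kuid) == user_value ? 0 : -EOVERFLOW;
}
```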
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pm_qos.h>

static inline void device_pm_init_common(struct device *dev)
{
	if (!dev->power.early_init) {
		spin_lock_init(&dev->power.lock);
		dev->power.qos = NULL;
		dev->power.early_init = true;
	}
}

#ifdef CONFIG_PM

static inline void pm_runtime_early_init(struct device *dev)
{
	dev->power.disable_depth = 1;
	device_pm_init_common(dev);
}

extern void pm_runtime_init(struct device *dev);
extern void pm_runtime_reinit(struct device *dev);
extern void pm_runtime_remove(struct device *dev);
extern u64 pm_runtime_active_time(struct device *dev);

#define WAKE_IRQ_DEDICATED_ALLOCATED	BIT(0)
#define WAKE_IRQ_DEDICATED_MANAGED	BIT(1)
#define WAKE_IRQ_DEDICATED_REVERSE	BIT(2)
#define WAKE_IRQ_DEDICATED_MASK		(WAKE_IRQ_DEDICATED_ALLOCATED | \
					 WAKE_IRQ_DEDICATED_MANAGED | \
					 WAKE_IRQ_DEDICATED_REVERSE)
#define WAKE_IRQ_DEDICATED_ENABLED	BIT(3)

struct wake_irq {
	struct device *dev;
	unsigned int status;
	int irq;
	const char *name;
};

extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_enable_wake_irq_check(struct device *dev,
					 bool can_change_status);
extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable);
extern void dev_pm_enable_wake_irq_complete(struct device *dev);

#ifdef CONFIG_PM_SLEEP

extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq);
extern void device_wakeup_detach_irq(struct device *dev);
extern void device_wakeup_arm_wake_irqs(void);
extern void device_wakeup_disarm_wake_irqs(void);

#else

static inline void device_wakeup_attach_irq(struct device *dev,
					    struct wake_irq *wakeirq) {}

static inline void device_wakeup_detach_irq(struct device *dev)
{
}

#endif /* CONFIG_PM_SLEEP */

/*
 * sysfs.c
 */

extern int dpm_sysfs_add(struct device *dev);
extern void dpm_sysfs_remove(struct device *dev);
extern void rpm_sysfs_remove(struct device *dev);
extern int wakeup_sysfs_add(struct device *dev);
extern void wakeup_sysfs_remove(struct device *dev);
extern int pm_qos_sysfs_add_resume_latency(struct device *dev);
extern void pm_qos_sysfs_remove_resume_latency(struct device *dev);
extern int pm_qos_sysfs_add_flags(struct device *dev);
extern void pm_qos_sysfs_remove_flags(struct device *dev);
extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev);
extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev);
extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);

#else /* CONFIG_PM */

static inline void pm_runtime_early_init(struct device *dev)
{
	device_pm_init_common(dev);
}

static inline void pm_runtime_init(struct device *dev) {}
static inline void pm_runtime_reinit(struct device *dev) {}
static inline void pm_runtime_remove(struct device *dev) {}

static inline int dpm_sysfs_add(struct device *dev) { return 0; }
static inline void dpm_sysfs_remove(struct device *dev) {}
static inline int
dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); } |
/*
 * linux/fs/nls/nls_cp864.c
 *
 * Charset cp864 translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

static const wchar_t charset2uni[256] = {
	/* 0x00*/
	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
	/* 0x10*/
	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
	/* 0x20*/
	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x066a, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	/* 0x30*/
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
	/* 0x40*/
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	/* 0x50*/
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
	/* 0x60*/
	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	/* 0x70*/
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
	/* 0x80*/
	0x00b0, 0x00b7, 0x2219, 0x221a, 0x2592, 0x2500, 0x2502, 0x253c,
	0x2524, 0x252c, 0x251c, 0x2534, 0x2510, 0x250c, 0x2514, 0x2518,
	/* 0x90*/
	0x03b2, 0x221e, 0x03c6, 0x00b1, 0x00bd, 0x00bc, 0x2248, 0x00ab,
	0x00bb, 0xfef7, 0xfef8, 0x0000, 0x0000, 0xfefb, 0xfefc, 0x0000,
	/* 0xa0*/
	0x00a0, 0x00ad, 0xfe82, 0x00a3, 0x00a4, 0xfe84, 0x0000, 0x0000,
	0xfe8e, 0xfe8f, 0xfe95, 0xfe99, 0x060c, 0xfe9d, 0xfea1, 0xfea5,
	/* 0xb0*/
	0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667,
	0x0668,
0x0669, 0xfed1, 0x061b, 0xfeb1, 0xfeb5, 0xfeb9, 0x061f, /* 0xc0*/ 0x00a2, 0xfe80, 0xfe81, 0xfe83, 0xfe85, 0xfeca, 0xfe8b, 0xfe8d, 0xfe91, 0xfe93, 0xfe97, 0xfe9b, 0xfe9f, 0xfea3, 0xfea7, 0xfea9, /* 0xd0*/ 0xfeab, 0xfead, 0xfeaf, 0xfeb3, 0xfeb7, 0xfebb, 0xfebf, 0xfec1, 0xfec5, 0xfecb, 0xfecf, 0x00a6, 0x00ac, 0x00f7, 0x00d7, 0xfec9, /* 0xe0*/ 0x0640, 0xfed3, 0xfed7, 0xfedb, 0xfedf, 0xfee3, 0xfee7, 0xfeeb, 0xfeed, 0xfeef, 0xfef3, 0xfebd, 0xfecc, 0xfece, 0xfecd, 0xfee1, /* 0xf0*/ 0xfe7d, 0x0651, 0xfee5, 0xfee9, 0xfeec, 0xfef0, 0xfef2, 0xfed0, 0xfed5, 0xfef5, 0xfef6, 0xfedd, 0xfed9, 0xfef1, 0x25a0, 0x0000, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */ 0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */ }; static const unsigned char page06[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */ 0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ }; static const unsigned char page25[256] = { 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f 
*/ 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char pagefe[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */ 0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */ 0x00, 0xc8, 0x00, 0xc9, 0x00, 0xaa, 0x00, 0xca, /* 0x90-0x97 */ 0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */ 0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */ 0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */ 0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */ 0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */ 0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */ 0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */ 0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */ 0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */ 0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */ 0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */ 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, page03, NULL, NULL, page06, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagefe, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 
0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp864", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp864(void) { return register_nls(&table); } static void __exit exit_nls_cp864(void) { unregister_nls(&table); } module_init(init_nls_cp864) module_exit(exit_nls_cp864) MODULE_DESCRIPTION("NLS Codepage 864 (Arabic)"); MODULE_LICENSE("Dual BSD/GPL"); |
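The uni2char() and char2uni() callbacks above resolve mappings through two lookup shapes: Unicode-to-charset goes through a two-level table (the high byte of the code point selects a page in page_uni2charset[], the low byte indexes into it), while charset-to-Unicode is a single direct lookup in charset2uni[]. A standalone sketch of that lookup scheme follows, using tiny stand-in tables rather than the real cp864 data; the only mapping included is U+066A (ARABIC PERCENT SIGN) <-> byte 0x25, which matches the tables in the file above.

```c
/* Illustrative two-level lookup mirroring uni2char()/char2uni() above.
 * The tables are minimal stand-ins, not the real cp864 tables. */
#include <stdio.h>

typedef unsigned short wchar16;  /* stand-in for the kernel's 16-bit wchar_t */

static const unsigned char page06[256] = { [0x6a] = 0x25 };
static const unsigned char *const page_uni2charset[256] = { [0x06] = page06 };
static const wchar16 charset2uni[256] = { [0x25] = 0x066a };

static int uni2char(wchar16 uni, unsigned char *out)
{
	/* First level: high byte selects the page table (may be NULL). */
	const unsigned char *page = page_uni2charset[(uni >> 8) & 0xff];
	unsigned char cl = uni & 0xff;

	if (page && page[cl]) {
		*out = page[cl];  /* second level: low byte indexes the page */
		return 1;
	}
	return -1;                /* no mapping for this code point */
}

int main(void)
{
	unsigned char c;

	if (uni2char(0x066a, &c) == 1)
		printf("U+066A -> 0x%02x -> U+%04X\n", c, charset2uni[c]);
	return 0;
}
```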
// SPDX-License-Identifier: GPL-2.0

struct io_sq_data {
	refcount_t		refs;
	atomic_t		park_pending;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;

	struct task_struct __rcu *thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
	int			sq_cpu;
	pid_t			task_pid;
	pid_t			task_tgid;

	u64			work_time;
	unsigned long		state;
	struct completion	exited;
};

int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p);
void io_sq_thread_finish(struct io_ring_ctx *ctx);
void io_sq_thread_stop(struct io_sq_data *sqd);
void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
u64 io_sq_cpu_usec(struct task_struct *tsk);

static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
{
	return rcu_dereference_protected(sqd->thread,
					 lockdep_is_held(&sqd->lock));
}
// SPDX-License-Identifier: GPL-2.0

#include <linux/quotaops.h>
#include <linux/uuid.h>

#include "ext4.h"
#include "xattr.h"
#include "ext4_jbd2.h"

static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
					 const struct fscrypt_name *src)
{
	memset(dst, 0, sizeof(*dst));
	dst->usr_fname = src->usr_fname;
	dst->disk_name = src->disk_name;
	dst->hinfo.hash = src->hash;
	dst->hinfo.minor_hash = src->minor_hash;
	dst->crypto_buf = src->crypto_buf;
}

int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
			      int lookup, struct ext4_filename *fname)
{
	struct fscrypt_name name;
	int err;

	err = fscrypt_setup_filename(dir, iname, lookup, &name);
	if (err)
		return err;

	ext4_fname_from_fscrypt_name(fname, &name);

	err = ext4_fname_setup_ci_filename(dir, iname, fname);
	if (err)
		ext4_fname_free_filename(fname);

	return err;
}

int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
			      struct ext4_filename *fname)
{
	struct fscrypt_name name;
	int err;

	err = fscrypt_prepare_lookup(dir, dentry, &name);
	if (err)
		return err;

	ext4_fname_from_fscrypt_name(fname, &name);

	err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
	if (err)
		ext4_fname_free_filename(fname);

	return err;
}

void ext4_fname_free_filename(struct ext4_filename *fname)
{
	struct fscrypt_name name;

	name.crypto_buf = fname->crypto_buf;
	fscrypt_free_filename(&name);

	fname->crypto_buf.name = NULL;
	fname->usr_fname = NULL;
	fname->disk_name.name = NULL;

	ext4_fname_free_ci_filename(fname);
}

static bool uuid_is_zero(__u8 u[16])
{
	int i;

	for (i = 0; i < 16; i++)
		if (u[i])
			return false;
	return true;
}

int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
{
	struct super_block *sb = file_inode(filp)->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int err, err2;
	handle_t *handle;

	if (!ext4_has_feature_encrypt(sb))
		return -EOPNOTSUPP;

	if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
		err = mnt_want_write_file(filp);
		if (err)
			return err;
		handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
		if (IS_ERR(handle)) {
			err = PTR_ERR(handle);
			goto pwsalt_err_exit;
		}
		err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
						    EXT4_JTR_NONE);
		if (err)
			goto pwsalt_err_journal;
		lock_buffer(sbi->s_sbh);
		generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
		ext4_superblock_csum_set(sb);
		unlock_buffer(sbi->s_sbh);
		err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
pwsalt_err_journal:
		err2 = ext4_journal_stop(handle);
		if (err2 && !err)
			err = err2;
pwsalt_err_exit:
		mnt_drop_write_file(filp);
		if (err)
			return err;
	}

	if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
		return -EFAULT;
	return 0;
}
static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); } static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; int res, res2, credits, retries = 0; /* * Encrypting the root directory is not allowed because e2fsck expects * lost+found to exist and be unencrypted, and encrypting the root * directory would imply encrypting the lost+found directory as well as * the filename "lost+found" itself. */ if (inode->i_ino == EXT4_ROOT_INO) return -EPERM; if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EOPNOTSUPP; res = ext4_convert_inline_data(inode); if (res) return res; /* * If a journal handle was specified, then the encryption context is * being set on a new inode via inheritance and is part of a larger * transaction to create the inode. Otherwise the encryption context is * being set on an existing inode in its own transaction. Only in the * latter case should the "retry on ENOSPC" logic be used. */ if (handle) { res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); /* * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ ext4_set_inode_flags(inode, false); } return res; } res = dquot_initialize(inode); if (res) return res; retry: res = ext4_xattr_set_credits(inode, len, false /* is_create */, &credits); if (res) return res; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); /* * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (!res) res = res2; return res; } static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) { return ext4_has_feature_stable_inodes(sb); } const struct fscrypt_operations ext4_cryptops = { .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - (int)offsetof(struct ext4_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, .legacy_key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .has_stable_inodes = ext4_has_stable_inodes, }; |
/*
 * linux/fs/nls/nls_euc-jp.c
 *
 * Added `OSF/JVC Recommended Code Set Conversion Specification
 * between Japanese EUC and Shift-JIS' support: <hirofumi@mail.parknet.co.jp>
 * (http://www.opengroup.or.jp/jvc/cde/sjis-euc-e.html)
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

static struct nls_table *p_nls;

#define IS_SJIS_LOW_BYTE(l)	((0x40 <= (l)) && ((l) <= 0xFC) && ((l) != 0x7F))
/* JIS X 0208 (include NEC special characters) */
#define IS_SJIS_JISX0208(h, l)	((((0x81 <= (h)) && ((h) <= 0x9F)) \
				  || ((0xE0 <= (h)) && ((h) <= 0xEA))) \
				 && IS_SJIS_LOW_BYTE(l))
#define IS_SJIS_JISX0201KANA(c)	((0xA1 <= (c)) && ((c) <= 0xDF))
#define IS_SJIS_UDC_LOW(h, l)	(((0xF0 <= (h)) && ((h) <= 0xF4)) \
				 && IS_SJIS_LOW_BYTE(l))
#define IS_SJIS_UDC_HI(h, l)	(((0xF5 <= (h)) && ((h) <= 0xF9)) \
				 && IS_SJIS_LOW_BYTE(l))
#define IS_SJIS_IBM(h, l)	(((0xFA <= (h)) && ((h) <= 0xFC)) \
				 && IS_SJIS_LOW_BYTE(l))
#define IS_SJIS_NECIBM(h, l)	(((0xED <= (h)) && ((h) <= 0xEE)) \
				 && IS_SJIS_LOW_BYTE(l))
#define MAP_SJIS2EUC(sjis_hi, sjis_lo, sjis_p, euc_hi, euc_lo, euc_p) { \
	if ((sjis_lo) >= 0x9F) { \
		(euc_hi) = (sjis_hi) * 2 - (((sjis_p) * 2 - (euc_p)) - 1); \
		(euc_lo) = (sjis_lo) + 2; \
	}
else { \ (euc_hi) = (sjis_hi) * 2 - ((sjis_p) * 2 - (euc_p)); \ (euc_lo) = (sjis_lo) + ((sjis_lo) >= 0x7F ? 0x60 : 0x61); \ } \ } while(0) #define SS2 (0x8E) /* Single Shift 2 */ #define SS3 (0x8F) /* Single Shift 3 */ #define IS_EUC_BYTE(c) ((0xA1 <= (c)) && ((c) <= 0xFE)) #define IS_EUC_JISX0208(h, l) (IS_EUC_BYTE(h) && IS_EUC_BYTE(l)) #define IS_EUC_JISX0201KANA(h, l) (((h) == SS2) && (0xA1 <= (l) && (l) <= 0xDF)) #define IS_EUC_UDC_LOW(h, l) (((0xF5 <= (h)) && ((h) <= 0xFE)) \ && IS_EUC_BYTE(l)) #define IS_EUC_UDC_HI(h, l) IS_EUC_UDC_LOW(h, l) /* G3 block */ #define MAP_EUC2SJIS(euc_hi, euc_lo, euc_p, sjis_hi, sjis_lo, sjis_p) { \ if ((euc_hi) & 1) { \ (sjis_hi) = (euc_hi) / 2 + ((sjis_p) - (euc_p) / 2); \ (sjis_lo) = (euc_lo) - ((euc_lo) >= 0xE0 ? 0x60 : 0x61); \ } else { \ (sjis_hi) = (euc_hi) / 2 + (((sjis_p) - (euc_p) / 2) - 1); \ (sjis_lo) = (euc_lo) - 2; \ } \ } while(0) /* SJIS IBM extended characters to EUC map */ static const unsigned char sjisibm2euc_map[][2] = { {0xF3, 0xF3}, {0xF3, 0xF4}, {0xF3, 0xF5}, {0xF3, 0xF6}, {0xF3, 0xF7}, {0xF3, 0xF8}, {0xF3, 0xF9}, {0xF3, 0xFA}, {0xF3, 0xFB}, {0xF3, 0xFC}, {0xF3, 0xFD}, {0xF3, 0xFE}, {0xF4, 0xA1}, {0xF4, 0xA2}, {0xF4, 0xA3}, {0xF4, 0xA4}, {0xF4, 0xA5}, {0xF4, 0xA6}, {0xF4, 0xA7}, {0xF4, 0xA8}, {0xA2, 0xCC}, {0xA2, 0xC3}, {0xF4, 0xA9}, {0xF4, 0xAA}, {0xF4, 0xAB}, {0xF4, 0xAC}, {0xF4, 0xAD}, {0xA2, 0xE8}, {0xD4, 0xE3}, {0xDC, 0xDF}, {0xE4, 0xE9}, {0xE3, 0xF8}, {0xD9, 0xA1}, {0xB1, 0xBB}, {0xF4, 0xAE}, {0xC2, 0xAD}, {0xC3, 0xFC}, {0xE4, 0xD0}, {0xC2, 0xBF}, {0xBC, 0xF4}, {0xB0, 0xA9}, {0xB0, 0xC8}, {0xF4, 0xAF}, {0xB0, 0xD2}, {0xB0, 0xD4}, {0xB0, 0xE3}, {0xB0, 0xEE}, {0xB1, 0xA7}, {0xB1, 0xA3}, {0xB1, 0xAC}, {0xB1, 0xA9}, {0xB1, 0xBE}, {0xB1, 0xDF}, {0xB1, 0xD8}, {0xB1, 0xC8}, {0xB1, 0xD7}, {0xB1, 0xE3}, {0xB1, 0xF4}, {0xB1, 0xE1}, {0xB2, 0xA3}, {0xF4, 0xB0}, {0xB2, 0xBB}, {0xB2, 0xE6}, {0x00, 0x00}, {0xB2, 0xED}, {0xB2, 0xF5}, {0xB2, 0xFC}, {0xF4, 0xB1}, {0xB3, 0xB5}, {0xB3, 0xD8}, {0xB3, 0xDB}, {0xB3, 0xE5}, {0xB3, 0xEE}, {0xB3, 0xFB}, {0xF4, 0xB2}, {0xF4, 0xB3}, {0xB4, 0xC0}, {0xB4, 0xC7}, {0xB4, 0xD0}, {0xB4, 0xDE}, {0xF4, 0xB4}, {0xB5, 0xAA}, {0xF4, 0xB5}, {0xB5, 0xAF}, {0xB5, 0xC4}, {0xB5, 0xE8}, {0xF4, 0xB6}, {0xB7, 0xC2}, {0xB7, 0xE4}, {0xB7, 0xE8}, {0xB7, 0xE7}, {0xF4, 0xB7}, {0xF4, 0xB8}, {0xF4, 0xB9}, {0xB8, 0xCE}, {0xB8, 0xE1}, {0xB8, 0xF5}, {0xB8, 0xF7}, {0xB8, 0xF8}, {0xB8, 0xFC}, {0xB9, 0xAF}, {0xB9, 0xB7}, {0xBA, 0xBE}, {0xBA, 0xDB}, {0xCD, 0xAA}, {0xBA, 0xE1}, {0xF4, 0xBA}, {0xBA, 0xEB}, {0xBB, 0xB3}, {0xBB, 0xB8}, {0xF4, 0xBB}, {0xBB, 0xCA}, {0xF4, 0xBC}, {0xF4, 0xBD}, {0xBB, 0xD0}, {0xBB, 0xDE}, {0xBB, 0xF4}, {0xBB, 0xF5}, {0xBB, 0xF9}, {0xBC, 0xE4}, {0xBC, 0xED}, {0xBC, 0xFE}, {0xF4, 0xBE}, {0xBD, 0xC2}, {0xBD, 0xE7}, {0xF4, 0xBF}, {0xBD, 0xF0}, {0xBE, 0xB0}, {0xBE, 0xAC}, {0xF4, 0xC0}, {0xBE, 0xB3}, {0xBE, 0xBD}, {0xBE, 0xCD}, {0xBE, 0xC9}, {0xBE, 0xE4}, {0xBF, 0xA8}, {0xBF, 0xC9}, {0xC0, 0xC4}, {0xC0, 0xE4}, {0xC0, 0xF4}, {0xC1, 0xA6}, {0xF4, 0xC1}, {0xC1, 0xF5}, {0xC1, 0xFC}, {0xF4, 0xC2}, {0xC1, 0xF8}, {0xC2, 0xAB}, {0xC2, 0xA1}, {0xC2, 0xA5}, {0xF4, 0xC3}, {0xC2, 0xB8}, {0xC2, 0xBA}, {0xF4, 0xC4}, {0xC2, 0xC4}, {0xC2, 0xD2}, {0xC2, 0xD7}, {0xC2, 0xDB}, {0xC2, 0xDE}, {0xC2, 0xED}, {0xC2, 0xF0}, {0xF4, 0xC5}, {0xC3, 0xA1}, {0xC3, 0xB5}, {0xC3, 0xC9}, {0xC3, 0xB9}, {0xF4, 0xC6}, {0xC3, 0xD8}, {0xC3, 0xFE}, {0xF4, 0xC7}, {0xC4, 0xCC}, {0xF4, 0xC8}, {0xC4, 0xD9}, {0xC4, 0xEA}, {0xC4, 0xFD}, {0xF4, 0xC9}, {0xC5, 0xA7}, {0xC5, 0xB5}, {0xC5, 0xB6}, {0xF4, 0xCA}, {0xC5, 0xD5}, {0xC6, 0xB8}, {0xC6, 0xD7}, {0xC6, 0xE0}, 
{0xC6, 0xEA}, {0xC6, 0xE3}, {0xC7, 0xA1}, {0xC7, 0xAB}, {0xC7, 0xC7}, {0xC7, 0xC3}, {0xC7, 0xCB}, {0xC7, 0xCF}, {0xC7, 0xD9}, {0xF4, 0xCB}, {0xF4, 0xCC}, {0xC7, 0xE6}, {0xC7, 0xEE}, {0xC7, 0xFC}, {0xC7, 0xEB}, {0xC7, 0xF0}, {0xC8, 0xB1}, {0xC8, 0xE5}, {0xC8, 0xF8}, {0xC9, 0xA6}, {0xC9, 0xAB}, {0xC9, 0xAD}, {0xF4, 0xCD}, {0xC9, 0xCA}, {0xC9, 0xD3}, {0xC9, 0xE9}, {0xC9, 0xE3}, {0xC9, 0xFC}, {0xC9, 0xF4}, {0xC9, 0xF5}, {0xF4, 0xCE}, {0xCA, 0xB3}, {0xCA, 0xBD}, {0xCA, 0xEF}, {0xCA, 0xF1}, {0xCB, 0xAE}, {0xF4, 0xCF}, {0xCB, 0xCA}, {0xCB, 0xE6}, {0xCB, 0xEA}, {0xCB, 0xF0}, {0xCB, 0xF4}, {0xCB, 0xEE}, {0xCC, 0xA5}, {0xCB, 0xF9}, {0xCC, 0xAB}, {0xCC, 0xAE}, {0xCC, 0xAD}, {0xCC, 0xB2}, {0xCC, 0xC2}, {0xCC, 0xD0}, {0xCC, 0xD9}, {0xF4, 0xD0}, {0xCD, 0xBB}, {0xF4, 0xD1}, {0xCE, 0xBB}, {0xF4, 0xD2}, {0xCE, 0xBA}, {0xCE, 0xC3}, {0xF4, 0xD3}, {0xCE, 0xF2}, {0xB3, 0xDD}, {0xCF, 0xD5}, {0xCF, 0xE2}, {0xCF, 0xE9}, {0xCF, 0xED}, {0xF4, 0xD4}, {0xF4, 0xD5}, {0xF4, 0xD6}, {0x00, 0x00}, {0xF4, 0xD7}, {0xD0, 0xE5}, {0xF4, 0xD8}, {0xD0, 0xE9}, {0xD1, 0xE8}, {0xF4, 0xD9}, {0xF4, 0xDA}, {0xD1, 0xEC}, {0xD2, 0xBB}, {0xF4, 0xDB}, {0xD3, 0xE1}, {0xD3, 0xE8}, {0xD4, 0xA7}, {0xF4, 0xDC}, {0xF4, 0xDD}, {0xD4, 0xD4}, {0xD4, 0xF2}, {0xD5, 0xAE}, {0xF4, 0xDE}, {0xD7, 0xDE}, {0xF4, 0xDF}, {0xD8, 0xA2}, {0xD8, 0xB7}, {0xD8, 0xC1}, {0xD8, 0xD1}, {0xD8, 0xF4}, {0xD9, 0xC6}, {0xD9, 0xC8}, {0xD9, 0xD1}, {0xF4, 0xE0}, {0xF4, 0xE1}, {0xF4, 0xE2}, {0xF4, 0xE3}, {0xF4, 0xE4}, {0xDC, 0xD3}, {0xDD, 0xC8}, {0xDD, 0xD4}, {0xDD, 0xEA}, {0xDD, 0xFA}, {0xDE, 0xA4}, {0xDE, 0xB0}, {0xF4, 0xE5}, {0xDE, 0xB5}, {0xDE, 0xCB}, {0xF4, 0xE6}, {0xDF, 0xB9}, {0xF4, 0xE7}, {0xDF, 0xC3}, {0xF4, 0xE8}, {0xF4, 0xE9}, {0xE0, 0xD9}, {0xF4, 0xEA}, {0xF4, 0xEB}, {0xE1, 0xE2}, {0xF4, 0xEC}, {0xF4, 0xED}, {0xF4, 0xEE}, {0xE2, 0xC7}, {0xE3, 0xA8}, {0xE3, 0xA6}, {0xE3, 0xA9}, {0xE3, 0xAF}, {0xE3, 0xB0}, {0xE3, 0xAA}, {0xE3, 0xAB}, {0xE3, 0xBC}, {0xE3, 0xC1}, {0xE3, 0xBF}, {0xE3, 0xD5}, {0xE3, 0xD8}, {0xE3, 0xD6}, {0xE3, 0xDF}, {0xE3, 0xE3}, {0xE3, 0xE1}, {0xE3, 0xD4}, {0xE3, 0xE9}, {0xE4, 0xA6}, {0xE3, 0xF1}, {0xE3, 0xF2}, {0xE4, 0xCB}, {0xE4, 0xC1}, {0xE4, 0xC3}, {0xE4, 0xBE}, {0xF4, 0xEF}, {0xE4, 0xC0}, {0xE4, 0xC7}, {0xE4, 0xBF}, {0xE4, 0xE0}, {0xE4, 0xDE}, {0xE4, 0xD1}, {0xF4, 0xF0}, {0xE4, 0xDC}, {0xE4, 0xD2}, {0xE4, 0xDB}, {0xE4, 0xD4}, {0xE4, 0xFA}, {0xE4, 0xEF}, {0xE5, 0xB3}, {0xE5, 0xBF}, {0xE5, 0xC9}, {0xE5, 0xD0}, {0xE5, 0xE2}, {0xE5, 0xEA}, {0xE5, 0xEB}, {0xF4, 0xF1}, {0xF4, 0xF2}, {0xF4, 0xF3}, {0xE6, 0xE8}, {0xE6, 0xEF}, {0xE7, 0xAC}, {0xF4, 0xF4}, {0xE7, 0xAE}, {0xF4, 0xF5}, {0xE7, 0xB1}, {0xF4, 0xF6}, {0xE7, 0xB2}, {0xE8, 0xB1}, {0xE8, 0xB6}, {0xF4, 0xF7}, {0xF4, 0xF8}, {0xE8, 0xDD}, {0xF4, 0xF9}, {0xF4, 0xFA}, {0xE9, 0xD1}, {0xF4, 0xFB}, {0xE9, 0xED}, {0xEA, 0xCD}, {0xF4, 0xFC}, {0xEA, 0xDB}, {0xEA, 0xE6}, {0xEA, 0xEA}, {0xEB, 0xA5}, {0xEB, 0xFB}, {0xEB, 0xFA}, {0xF4, 0xFD}, {0xEC, 0xD6}, {0xF4, 0xFE}, }; #define IS_EUC_IBM2JISX0208(h, l) \ (((h) == 0xA2 && (l) == 0xCC) || ((h) == 0xA2 && (l) == 0xE8)) /* EUC to SJIS IBM extended characters map (G3 JIS X 0212 block) */ static struct { unsigned short euc; unsigned char sjis[2]; } euc2sjisibm_jisx0212_map[] = { {0xA2C3, {0xFA, 0x55}}, {0xB0A9, {0xFA, 0x68}}, {0xB0C8, {0xFA, 0x69}}, {0xB0D2, {0xFA, 0x6B}}, {0xB0D4, {0xFA, 0x6C}}, {0xB0E3, {0xFA, 0x6D}}, {0xB0EE, {0xFA, 0x6E}}, {0xB1A3, {0xFA, 0x70}}, {0xB1A7, {0xFA, 0x6F}}, {0xB1A9, {0xFA, 0x72}}, {0xB1AC, {0xFA, 0x71}}, {0xB1BB, {0xFA, 0x61}}, {0xB1BE, {0xFA, 0x73}}, {0xB1C8, {0xFA, 0x76}}, {0xB1D7, {0xFA, 0x77}}, {0xB1D8, {0xFA, 0x75}}, {0xB1DF, 
{0xFA, 0x74}}, {0xB1E1, {0xFA, 0x7A}}, {0xB1E3, {0xFA, 0x78}}, {0xB1F4, {0xFA, 0x79}}, {0xB2A3, {0xFA, 0x7B}}, {0xB2BB, {0xFA, 0x7D}}, {0xB2E6, {0xFA, 0x7E}}, {0xB2ED, {0xFA, 0x80}}, {0xB2F5, {0xFA, 0x81}}, {0xB2FC, {0xFA, 0x82}}, {0xB3B5, {0xFA, 0x84}}, {0xB3D8, {0xFA, 0x85}}, {0xB3DB, {0xFA, 0x86}}, {0xB3DD, {0xFB, 0x77}}, {0xB3E5, {0xFA, 0x87}}, {0xB3EE, {0xFA, 0x88}}, {0xB3FB, {0xFA, 0x89}}, {0xB4C0, {0xFA, 0x8C}}, {0xB4C7, {0xFA, 0x8D}}, {0xB4D0, {0xFA, 0x8E}}, {0xB4DE, {0xFA, 0x8F}}, {0xB5AA, {0xFA, 0x91}}, {0xB5AF, {0xFA, 0x93}}, {0xB5C4, {0xFA, 0x94}}, {0xB5E8, {0xFA, 0x95}}, {0xB7C2, {0xFA, 0x97}}, {0xB7E4, {0xFA, 0x98}}, {0xB7E7, {0xFA, 0x9A}}, {0xB7E8, {0xFA, 0x99}}, {0xB8CE, {0xFA, 0x9E}}, {0xB8E1, {0xFA, 0x9F}}, {0xB8F5, {0xFA, 0xA0}}, {0xB8F7, {0xFA, 0xA1}}, {0xB8F8, {0xFA, 0xA2}}, {0xB8FC, {0xFA, 0xA3}}, {0xB9AF, {0xFA, 0xA4}}, {0xB9B7, {0xFA, 0xA5}}, {0xBABE, {0xFA, 0xA6}}, {0xBADB, {0xFA, 0xA7}}, {0xBAE1, {0xFA, 0xA9}}, {0xBAEB, {0xFA, 0xAB}}, {0xBBB3, {0xFA, 0xAC}}, {0xBBB8, {0xFA, 0xAD}}, {0xBBCA, {0xFA, 0xAF}}, {0xBBD0, {0xFA, 0xB2}}, {0xBBDE, {0xFA, 0xB3}}, {0xBBF4, {0xFA, 0xB4}}, {0xBBF5, {0xFA, 0xB5}}, {0xBBF9, {0xFA, 0xB6}}, {0xBCE4, {0xFA, 0xB7}}, {0xBCED, {0xFA, 0xB8}}, {0xBCF4, {0xFA, 0x67}}, {0xBCFE, {0xFA, 0xB9}}, {0xBDC2, {0xFA, 0xBB}}, {0xBDE7, {0xFA, 0xBC}}, {0xBDF0, {0xFA, 0xBE}}, {0xBEAC, {0xFA, 0xC0}}, {0xBEB0, {0xFA, 0xBF}}, {0xBEB3, {0xFA, 0xC2}}, {0xBEBD, {0xFA, 0xC3}}, {0xBEC9, {0xFA, 0xC5}}, {0xBECD, {0xFA, 0xC4}}, {0xBEE4, {0xFA, 0xC6}}, {0xBFA8, {0xFA, 0xC7}}, {0xBFC9, {0xFA, 0xC8}}, {0xC0C4, {0xFA, 0xC9}}, {0xC0E4, {0xFA, 0xCA}}, {0xC0F4, {0xFA, 0xCB}}, {0xC1A6, {0xFA, 0xCC}}, {0xC1F5, {0xFA, 0xCE}}, {0xC1F8, {0xFA, 0xD1}}, {0xC1FC, {0xFA, 0xCF}}, {0xC2A1, {0xFA, 0xD3}}, {0xC2A5, {0xFA, 0xD4}}, {0xC2AB, {0xFA, 0xD2}}, {0xC2AD, {0xFA, 0x63}}, {0xC2B8, {0xFA, 0xD6}}, {0xC2BA, {0xFA, 0xD7}}, {0xC2BF, {0xFA, 0x66}}, {0xC2C4, {0xFA, 0xD9}}, {0xC2D2, {0xFA, 0xDA}}, {0xC2D7, {0xFA, 0xDB}}, {0xC2DB, {0xFA, 0xDC}}, {0xC2DE, {0xFA, 0xDD}}, {0xC2ED, {0xFA, 0xDE}}, {0xC2F0, {0xFA, 0xDF}}, {0xC3A1, {0xFA, 0xE1}}, {0xC3B5, {0xFA, 0xE2}}, {0xC3B9, {0xFA, 0xE4}}, {0xC3C9, {0xFA, 0xE3}}, {0xC3D8, {0xFA, 0xE6}}, {0xC3FC, {0xFA, 0x64}}, {0xC3FE, {0xFA, 0xE7}}, {0xC4CC, {0xFA, 0xE9}}, {0xC4D9, {0xFA, 0xEB}}, {0xC4EA, {0xFA, 0xEC}}, {0xC4FD, {0xFA, 0xED}}, {0xC5A7, {0xFA, 0xEF}}, {0xC5B5, {0xFA, 0xF0}}, {0xC5B6, {0xFA, 0xF1}}, {0xC5D5, {0xFA, 0xF3}}, {0xC6B8, {0xFA, 0xF4}}, {0xC6D7, {0xFA, 0xF5}}, {0xC6E0, {0xFA, 0xF6}}, {0xC6E3, {0xFA, 0xF8}}, {0xC6EA, {0xFA, 0xF7}}, {0xC7A1, {0xFA, 0xF9}}, {0xC7AB, {0xFA, 0xFA}}, {0xC7C3, {0xFA, 0xFC}}, {0xC7C7, {0xFA, 0xFB}}, {0xC7CB, {0xFB, 0x40}}, {0xC7CF, {0xFB, 0x41}}, {0xC7D9, {0xFB, 0x42}}, {0xC7E6, {0xFB, 0x45}}, {0xC7EB, {0xFB, 0x48}}, {0xC7EE, {0xFB, 0x46}}, {0xC7F0, {0xFB, 0x49}}, {0xC7FC, {0xFB, 0x47}}, {0xC8B1, {0xFB, 0x4A}}, {0xC8E5, {0xFB, 0x4B}}, {0xC8F8, {0xFB, 0x4C}}, {0xC9A6, {0xFB, 0x4D}}, {0xC9AB, {0xFB, 0x4E}}, {0xC9AD, {0xFB, 0x4F}}, {0xC9CA, {0xFB, 0x51}}, {0xC9D3, {0xFB, 0x52}}, {0xC9E3, {0xFB, 0x54}}, {0xC9E9, {0xFB, 0x53}}, {0xC9F4, {0xFB, 0x56}}, {0xC9F5, {0xFB, 0x57}}, {0xC9FC, {0xFB, 0x55}}, {0xCAB3, {0xFB, 0x59}}, {0xCABD, {0xFB, 0x5A}}, {0xCAEF, {0xFB, 0x5B}}, {0xCAF1, {0xFB, 0x5C}}, {0xCBAE, {0xFB, 0x5D}}, {0xCBCA, {0xFB, 0x5F}}, {0xCBE6, {0xFB, 0x60}}, {0xCBEA, {0xFB, 0x61}}, {0xCBEE, {0xFB, 0x64}}, {0xCBF0, {0xFB, 0x62}}, {0xCBF4, {0xFB, 0x63}}, {0xCBF9, {0xFB, 0x66}}, {0xCCA5, {0xFB, 0x65}}, {0xCCAB, {0xFB, 0x67}}, {0xCCAD, {0xFB, 0x69}}, {0xCCAE, {0xFB, 0x68}}, {0xCCB2, {0xFB, 0x6A}}, {0xCCC2, 
{0xFB, 0x6B}}, {0xCCD0, {0xFB, 0x6C}}, {0xCCD9, {0xFB, 0x6D}}, {0xCDAA, {0xFA, 0xA8}}, {0xCDBB, {0xFB, 0x6F}}, {0xCEBA, {0xFB, 0x73}}, {0xCEBB, {0xFB, 0x71}}, {0xCEC3, {0xFB, 0x74}}, {0xCEF2, {0xFB, 0x76}}, {0xCFD5, {0xFB, 0x78}}, {0xCFE2, {0xFB, 0x79}}, {0xCFE9, {0xFB, 0x7A}}, {0xCFED, {0xFB, 0x7B}}, {0xD0E5, {0xFB, 0x81}}, {0xD0E9, {0xFB, 0x83}}, {0xD1E8, {0xFB, 0x84}}, {0xD1EC, {0xFB, 0x87}}, {0xD2BB, {0xFB, 0x88}}, {0xD3E1, {0xFB, 0x8A}}, {0xD3E8, {0xFB, 0x8B}}, {0xD4A7, {0xFB, 0x8C}}, {0xD4D4, {0xFB, 0x8F}}, {0xD4E3, {0xFA, 0x5C}}, {0xD4F2, {0xFB, 0x90}}, {0xD5AE, {0xFB, 0x91}}, {0xD7DE, {0xFB, 0x93}}, {0xD8A2, {0xFB, 0x95}}, {0xD8B7, {0xFB, 0x96}}, {0xD8C1, {0xFB, 0x97}}, {0xD8D1, {0xFB, 0x98}}, {0xD8F4, {0xFB, 0x99}}, {0xD9A1, {0xFA, 0x60}}, {0xD9C6, {0xFB, 0x9A}}, {0xD9C8, {0xFB, 0x9B}}, {0xD9D1, {0xFB, 0x9C}}, {0xDCD3, {0xFB, 0xA2}}, {0xDCDF, {0xFA, 0x5D}}, {0xDDC8, {0xFB, 0xA3}}, {0xDDD4, {0xFB, 0xA4}}, {0xDDEA, {0xFB, 0xA5}}, {0xDDFA, {0xFB, 0xA6}}, {0xDEA4, {0xFB, 0xA7}}, {0xDEB0, {0xFB, 0xA8}}, {0xDEB5, {0xFB, 0xAA}}, {0xDECB, {0xFB, 0xAB}}, {0xDFB9, {0xFB, 0xAD}}, {0xDFC3, {0xFB, 0xAF}}, {0xE0D9, {0xFB, 0xB2}}, {0xE1E2, {0xFB, 0xB5}}, {0xE2C7, {0xFB, 0xB9}}, {0xE3A6, {0xFB, 0xBB}}, {0xE3A8, {0xFB, 0xBA}}, {0xE3A9, {0xFB, 0xBC}}, {0xE3AA, {0xFB, 0xBF}}, {0xE3AB, {0xFB, 0xC0}}, {0xE3AF, {0xFB, 0xBD}}, {0xE3B0, {0xFB, 0xBE}}, {0xE3BC, {0xFB, 0xC1}}, {0xE3BF, {0xFB, 0xC3}}, {0xE3C1, {0xFB, 0xC2}}, {0xE3D4, {0xFB, 0xCA}}, {0xE3D5, {0xFB, 0xC4}}, {0xE3D6, {0xFB, 0xC6}}, {0xE3D8, {0xFB, 0xC5}}, {0xE3DF, {0xFB, 0xC7}}, {0xE3E1, {0xFB, 0xC9}}, {0xE3E3, {0xFB, 0xC8}}, {0xE3E9, {0xFB, 0xCB}}, {0xE3F1, {0xFB, 0xCD}}, {0xE3F2, {0xFB, 0xCE}}, {0xE3F8, {0xFA, 0x5F}}, {0xE4A6, {0xFB, 0xCC}}, {0xE4BE, {0xFB, 0xD2}}, {0xE4BF, {0xFB, 0xD6}}, {0xE4C0, {0xFB, 0xD4}}, {0xE4C1, {0xFB, 0xD0}}, {0xE4C3, {0xFB, 0xD1}}, {0xE4C7, {0xFB, 0xD5}}, {0xE4CB, {0xFB, 0xCF}}, {0xE4D0, {0xFA, 0x65}}, {0xE4D1, {0xFB, 0xD9}}, {0xE4D2, {0xFB, 0xDC}}, {0xE4D4, {0xFB, 0xDE}}, {0xE4DB, {0xFB, 0xDD}}, {0xE4DC, {0xFB, 0xDB}}, {0xE4DE, {0xFB, 0xD8}}, {0xE4E0, {0xFB, 0xD7}}, {0xE4E9, {0xFA, 0x5E}}, {0xE4EF, {0xFB, 0xE0}}, {0xE4FA, {0xFB, 0xDF}}, {0xE5B3, {0xFB, 0xE1}}, {0xE5BF, {0xFB, 0xE2}}, {0xE5C9, {0xFB, 0xE3}}, {0xE5D0, {0xFB, 0xE4}}, {0xE5E2, {0xFB, 0xE5}}, {0xE5EA, {0xFB, 0xE6}}, {0xE5EB, {0xFB, 0xE7}}, {0xE6E8, {0xFB, 0xEB}}, {0xE6EF, {0xFB, 0xEC}}, {0xE7AC, {0xFB, 0xED}}, {0xE7AE, {0xFB, 0xEF}}, {0xE7B1, {0xFB, 0xF1}}, {0xE7B2, {0xFB, 0xF3}}, {0xE8B1, {0xFB, 0xF4}}, {0xE8B6, {0xFB, 0xF5}}, {0xE8DD, {0xFB, 0xF8}}, {0xE9D1, {0xFB, 0xFB}}, {0xE9ED, {0xFC, 0x40}}, {0xEACD, {0xFC, 0x41}}, {0xEADB, {0xFC, 0x43}}, {0xEAE6, {0xFC, 0x44}}, {0xEAEA, {0xFC, 0x45}}, {0xEBA5, {0xFC, 0x46}}, {0xEBFA, {0xFC, 0x48}}, {0xEBFB, {0xFC, 0x47}}, {0xECD6, {0xFC, 0x4A}}, }; /* EUC to SJIS IBM extended characters map (G3 Upper block) */ static const unsigned char euc2sjisibm_g3upper_map[][2] = { {0xFA, 0x40}, {0xFA, 0x41}, {0xFA, 0x42}, {0xFA, 0x43}, {0xFA, 0x44}, {0xFA, 0x45}, {0xFA, 0x46}, {0xFA, 0x47}, {0xFA, 0x48}, {0xFA, 0x49}, {0xFA, 0x4A}, {0xFA, 0x4B}, {0xFA, 0x4C}, {0xFA, 0x4D}, {0xFA, 0x4E}, {0xFA, 0x4F}, {0xFA, 0x50}, {0xFA, 0x51}, {0xFA, 0x52}, {0xFA, 0x53}, {0xFA, 0x56}, {0xFA, 0x57}, {0xFA, 0x58}, {0xFA, 0x59}, {0xFA, 0x5A}, {0xFA, 0x62}, {0xFA, 0x6A}, {0xFA, 0x7C}, {0xFA, 0x83}, {0xFA, 0x8A}, {0xFA, 0x8B}, {0xFA, 0x90}, {0xFA, 0x92}, {0xFA, 0x96}, {0xFA, 0x9B}, {0xFA, 0x9C}, {0xFA, 0x9D}, {0xFA, 0xAA}, {0xFA, 0xAE}, {0xFA, 0xB0}, {0xFA, 0xB1}, {0xFA, 0xBA}, {0xFA, 0xBD}, {0xFA, 0xC1}, {0xFA, 0xCD}, {0xFA, 0xD0}, {0xFA, 
0xD5}, {0xFA, 0xD8}, {0xFA, 0xE0}, {0xFA, 0xE5}, {0xFA, 0xE8}, {0xFA, 0xEA}, {0xFA, 0xEE}, {0xFA, 0xF2}, {0xFB, 0x43}, {0xFB, 0x44}, {0xFB, 0x50}, {0xFB, 0x58}, {0xFB, 0x5E}, {0xFB, 0x6E}, {0xFB, 0x70}, {0xFB, 0x72}, {0xFB, 0x75}, {0xFB, 0x7C}, {0xFB, 0x7D}, {0xFB, 0x7E}, {0xFB, 0x80}, {0xFB, 0x82}, {0xFB, 0x85}, {0xFB, 0x86}, {0xFB, 0x89}, {0xFB, 0x8D}, {0xFB, 0x8E}, {0xFB, 0x92}, {0xFB, 0x94}, {0xFB, 0x9D}, {0xFB, 0x9E}, {0xFB, 0x9F}, {0xFB, 0xA0}, {0xFB, 0xA1}, {0xFB, 0xA9}, {0xFB, 0xAC}, {0xFB, 0xAE}, {0xFB, 0xB0}, {0xFB, 0xB1}, {0xFB, 0xB3}, {0xFB, 0xB4}, {0xFB, 0xB6}, {0xFB, 0xB7}, {0xFB, 0xB8}, {0xFB, 0xD3}, {0xFB, 0xDA}, {0xFB, 0xE8}, {0xFB, 0xE9}, {0xFB, 0xEA}, {0xFB, 0xEE}, {0xFB, 0xF0}, {0xFB, 0xF2}, {0xFB, 0xF6}, {0xFB, 0xF7}, {0xFB, 0xF9}, {0xFB, 0xFA}, {0xFB, 0xFC}, {0xFC, 0x42}, {0xFC, 0x49}, {0xFC, 0x4B}, }; static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, const unsigned char sjis_lo); static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo); static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo); static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo); static inline int sjisnec2sjisibm(unsigned char *sjisibm, const unsigned char sjisnec_hi, const unsigned char sjisnec_lo); /* SJIS IBM extended characters to EUC */ static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, const unsigned char sjis_lo) { int index; index = ((sjis_hi - 0xFA) * (0xFD - 0x40)) + (sjis_lo - 0x40); if (IS_EUC_IBM2JISX0208(sjisibm2euc_map[index][0], sjisibm2euc_map[index][1])) { euc[0] = sjisibm2euc_map[index][0]; euc[1] = sjisibm2euc_map[index][1]; return 2; } else { euc[0] = SS3; euc[1] = sjisibm2euc_map[index][0]; euc[2] = sjisibm2euc_map[index][1]; return 3; } } /* EUC to SJIS IBM extended characters (G3 JIS X 0212 block) */ static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo) { int index, min_index, max_index; unsigned short euc; min_index = 0; max_index = ARRAY_SIZE(euc2sjisibm_jisx0212_map) - 1; euc = (euc_hi << 8) | euc_lo; while (min_index <= max_index) { index = (min_index + max_index) / 2; if (euc < euc2sjisibm_jisx0212_map[index].euc) max_index = index - 1; else min_index = index + 1; if (euc == euc2sjisibm_jisx0212_map[index].euc) { sjis[0] = euc2sjisibm_jisx0212_map[index].sjis[0]; sjis[1] = euc2sjisibm_jisx0212_map[index].sjis[1]; return 3; } } return 0; } /* EUC to SJIS IBM extended characters (G3 Upper block) */ static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo) { int index; if (euc_hi == 0xF3) index = ((euc_hi << 8) | euc_lo) - 0xF3F3; else index = ((euc_hi << 8) | euc_lo) - 0xF4A1 + 12; if ((index < 0) || (index >= ARRAY_SIZE(euc2sjisibm_g3upper_map))) return 0; sjis[0] = euc2sjisibm_g3upper_map[index][0]; sjis[1] = euc2sjisibm_g3upper_map[index][1]; return 3; } /* EUC to SJIS IBM extended characters (G3 block) */ static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo) { int n; #if 0 if ((euc_hi == 0xA2) && (euc_lo == 0xCC)) { sjis[0] = 0xFA; sjis[1] = 0x54; return 2; } else if ((euc_hi == 0xA2) && (euc_lo == 0xE8)) { sjis[0] = 0xFA; sjis[1] = 0x5B; return 2; } #endif if ((n = euc2sjisibm_g3upper(sjis, euc_hi, euc_lo))) { return n; } else if ((n = euc2sjisibm_jisx0212(sjis, euc_hi, 
euc_lo))) { return n; } return 0; } /* NEC/IBM extended characters to IBM extended characters */ static inline int sjisnec2sjisibm(unsigned char *sjisibm, const unsigned char sjisnec_hi, const unsigned char sjisnec_lo) { int count; if (! IS_SJIS_NECIBM(sjisnec_hi, sjisnec_lo)) return 0; if ((sjisnec_hi == 0xEE) && (sjisnec_lo == 0xF9)) { sjisibm[0] = 0x81; sjisibm[1] = 0xCA; return 2; } if ((sjisnec_hi == 0xEE) && (sjisnec_lo >= 0xEF)) { count = (sjisnec_hi << 8 | sjisnec_lo) - (sjisnec_lo <= 0xF9 ? 0xEEEF : (0xEEEF - 10)); } else { count = (sjisnec_hi - 0xED) * (0xFC - 0x40) + (sjisnec_lo - 0x40) + (0x5C - 0x40); if (sjisnec_lo >= 0x7F) count--; } sjisibm[0] = 0xFA + (count / (0xFC - 0x40)); sjisibm[1] = 0x40 + (count % (0xFC - 0x40)); if (sjisibm[1] >= 0x7F) sjisibm[1]++; return 2; } static int uni2char(const wchar_t uni, unsigned char *out, int boundlen) { int n; if (!p_nls) return -EINVAL; if ((n = p_nls->uni2char(uni, out, boundlen)) < 0) return n; /* translate SJIS into EUC-JP */ if (n == 1) { if (IS_SJIS_JISX0201KANA(out[0])) { /* JIS X 0201 KANA */ if (boundlen < 2) return -ENAMETOOLONG; out[1] = out[0]; out[0] = SS2; return 2; } } else if (n == 2) { /* NEC/IBM extended characters to IBM extended characters */ sjisnec2sjisibm(out, out[0], out[1]); if (IS_SJIS_UDC_LOW(out[0], out[1])) { /* User defined characters half low */ MAP_SJIS2EUC(out[0], out[1], 0xF0, out[0], out[1], 0xF5); } else if (IS_SJIS_UDC_HI(out[0], out[1])) { /* User defined characters half high */ unsigned char ch, cl; if (boundlen < 3) return -ENAMETOOLONG; n = 3; ch = out[0]; cl = out[1]; out[0] = SS3; MAP_SJIS2EUC(ch, cl, 0xF5, out[1], out[2], 0xF5); } else if (IS_SJIS_IBM(out[0], out[1])) { /* IBM extended characters */ unsigned char euc[3], i; n = sjisibm2euc(euc, out[0], out[1]); if (boundlen < n) return -ENAMETOOLONG; for (i = 0; i < n; i++) out[i] = euc[i]; } else if (IS_SJIS_JISX0208(out[0], out[1])) { /* JIS X 0208 (include NEC special characters) */ out[0] = (out[0]^0xA0)*2 + 0x5F; if (out[1] > 0x9E) out[0]++; if (out[1] < 0x7F) out[1] = out[1] + 0x61; else if (out[1] < 0x9F) out[1] = out[1] + 0x60; else out[1] = out[1] + 0x02; } else { /* Invalid characters */ return -EINVAL; } } else return -EINVAL; return n; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { unsigned char sjis_temp[2]; int euc_offset, n; if ( !p_nls ) return -EINVAL; if (boundlen <= 0) return -ENAMETOOLONG; /* translate EUC-JP into SJIS */ if (rawstring[0] > 0x7F) { if (rawstring[0] == SS3) { if (boundlen < 3) return -EINVAL; euc_offset = 3; if (IS_EUC_UDC_HI(rawstring[1], rawstring[2])) { /* User defined characters half high */ MAP_EUC2SJIS(rawstring[1], rawstring[2], 0xF5, sjis_temp[0], sjis_temp[1], 0xF5); } else if (euc2sjisibm(sjis_temp,rawstring[1],rawstring[2])) { /* IBM extended characters */ } else { /* JIS X 0212 and Invalid characters*/ return -EINVAL; /* 'GETA' with SJIS coding */ /* sjis_temp[0] = 0x81; */ /* sjis_temp[1] = 0xAC; */ } } else { if (boundlen < 2) return -EINVAL; euc_offset = 2; if (IS_EUC_JISX0201KANA(rawstring[0], rawstring[1])) { /* JIS X 0201 KANA */ sjis_temp[0] = rawstring[1]; sjis_temp[1] = 0x00; } else if (IS_EUC_UDC_LOW(rawstring[0], rawstring[1])) { /* User defined characters half low */ MAP_EUC2SJIS(rawstring[0], rawstring[1], 0xF5, sjis_temp[0], sjis_temp[1], 0xF0); } else if (IS_EUC_JISX0208(rawstring[0], rawstring[1])) { /* JIS X 0208 (include NEC spesial characters) */ sjis_temp[0] = ((rawstring[0]-0x5f)/2) ^ 0xA0; if (!(rawstring[0] & 1)) sjis_temp[1] = 
rawstring[1] - 0x02; else if (rawstring[1] < 0xE0) sjis_temp[1] = rawstring[1] - 0x61; else sjis_temp[1] = rawstring[1] - 0x60; } else { /* Invalid characters */ return -EINVAL; } } } else { euc_offset = 1; /* JIS X 0201 ROMAJI */ sjis_temp[0] = rawstring[0]; sjis_temp[1] = 0x00; } if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0) return n; return euc_offset; } static struct nls_table table = { .charset = "euc-jp", .uni2char = uni2char, .char2uni = char2uni, }; static int __init init_nls_euc_jp(void) { p_nls = load_nls("cp932"); if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; return register_nls(&table); } return -EINVAL; } static void __exit exit_nls_euc_jp(void) { unregister_nls(&table); unload_nls(p_nls); } module_init(init_nls_euc_jp) module_exit(exit_nls_euc_jp) MODULE_DESCRIPTION("NLS Japanese charset (EUC-JP)"); MODULE_LICENSE("Dual BSD/GPL"); |
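/*
 * A standalone sketch (not part of the module above) of the JIS X 0208 byte
 * arithmetic that uni2char() and char2uni() apply when converting between
 * Shift-JIS and EUC-JP. The sample code point (Shift-JIS 0x82 0xA0,
 * HIRAGANA LETTER A) is chosen only for illustration.
 */
#include <stdio.h>

/* Shift-JIS lead/trail byte -> EUC-JP, same arithmetic as uni2char() */
static void sjis_to_euc(unsigned char *hi, unsigned char *lo)
{
	*hi = (*hi ^ 0xA0) * 2 + 0x5F;
	if (*lo > 0x9E)
		(*hi)++;
	if (*lo < 0x7F)
		*lo += 0x61;
	else if (*lo < 0x9F)
		*lo += 0x60;
	else
		*lo += 0x02;
}

/* EUC-JP lead/trail byte -> Shift-JIS, same arithmetic as char2uni() */
static void euc_to_sjis(unsigned char *hi, unsigned char *lo)
{
	unsigned char euc_hi = *hi;

	*hi = ((euc_hi - 0x5F) / 2) ^ 0xA0;
	if (!(euc_hi & 1))
		*lo -= 0x02;
	else if (*lo < 0xE0)
		*lo -= 0x61;
	else
		*lo -= 0x60;
}

int main(void)
{
	unsigned char hi = 0x82, lo = 0xA0;	/* Shift-JIS for U+3042 */

	sjis_to_euc(&hi, &lo);
	printf("EUC-JP:    %02X %02X\n", hi, lo);	/* expect A4 A2 */
	euc_to_sjis(&hi, &lo);
	printf("Shift-JIS: %02X %02X\n", hi, lo);	/* expect 82 A0 */
	return 0;
}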
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio PCI driver - common functionality for all device versions
 *
 * This module allows virtio devices to be used over a virtual PCI device.
* This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return false; return index == vp_dev->admin_vq.vq_index; } /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Notify all slow path virtqueues on an interrupt. */ static void vp_vring_slow_path_interrupt(int irq, struct virtio_pci_device *vp_dev) { struct virtio_pci_vq_info *info; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->slow_virtqueues, node) vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vp_dev->lock, flags); } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. 
*/ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_array(nvectors, sizeof(*vp_dev->msix_names), GFP_KERNEL); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (!per_vq_vectors) desc = NULL; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static bool vp_is_slow_path_vector(u16 msix_vec) { return msix_vec == VP_MSIX_CONFIG_VECTOR; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); if (!vp_is_slow_path_vector(msix_vec)) list_add(&info->node, &vp_dev->virtqueues); else list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } *p_info = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. 
*/ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info : vp_dev->vqs[vq->index]; if (vp_dev->per_vq_vectors) { int v = info->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR && !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq, info); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; u16 msix_vec; int err; if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && !slow_path)) msix_vec = (*allocated_vectors)++; else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && slow_path) msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, p_info); if (IS_ERR(vq)) return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || msix_vec == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), "%s-%s", dev_name(&vp_dev->vdev.dev), name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vq); if (err) { vp_del_vq(vq, *p_info); return ERR_PTR(err); } return vq; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; struct virtqueue *vq; bool per_vq_vectors; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, 
&avq->vq_index, &avq_num); if (err) goto error_find; } per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (vqi->name && vqi->callback) ++nvectors; } if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, avq->name, false, true, &allocated_vectors, vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; struct virtqueue *vq; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto out_del_vqs; } err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { struct virtqueue_info *vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, false, VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one shared vector for config and * slow path queues, one vector per queue for the rest. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. 
*/ return vp_find_vqs_intx(vdev, nvqs, vqs, vqs_info); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. 
*/ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret = 0; ret = virtio_device_reset_prepare(&vp_dev->vdev); if (ret) { if (ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset prepare failure: %d", ret); return; } if (pci_is_enabled(pci_dev)) pci_disable_device(pci_dev); } static void virtio_pci_reset_done(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; if (pci_is_enabled(pci_dev)) return; ret = pci_enable_device(pci_dev); if (!ret) { pci_set_master(pci_dev); ret = virtio_device_reset_done(&vp_dev->vdev); } if (ret && ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); } static const struct pci_error_handlers virtio_pci_err_handler = { .reset_prepare = virtio_pci_reset_prepare, .reset_done = virtio_pci_reset_done, }; static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = 
&virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1"); |
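/*
 * A small user-space sketch showing how virtio_pci_sriov_configure() is
 * normally reached: the PCI core invokes a driver's .sriov_configure callback
 * when the standard "sriov_numvfs" sysfs attribute is written. The device
 * address in the path and the requested VF count are placeholders for
 * illustration only.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/bus/pci/devices/0000:00:05.0/sriov_numvfs"; /* placeholder BDF */
	const char *nvfs = "4\n";	/* arbitrary VF count */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * The write fails if the device is not DRIVER_OK, does not offer
	 * VIRTIO_F_SR_IOV, or already has VFs assigned (see the checks in
	 * virtio_pci_sriov_configure() above).
	 */
	if (write(fd, nvfs, strlen(nvfs)) < 0)
		perror("write");
	close(fd);
	return 0;
}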
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* General filesystem caching interface
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * NOTE!!! See:
 *
 *	Documentation/filesystems/caching/netfs-api.rst
 *
 * for a description of the network filesystem interface declared here.
*/ #ifndef _LINUX_FSCACHE_H #define _LINUX_FSCACHE_H #include <linux/fs.h> #include <linux/netfs.h> #include <linux/writeback.h> #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) #define __fscache_available (1) #define fscache_available() (1) #define fscache_volume_valid(volume) (volume) #define fscache_cookie_valid(cookie) (cookie) #define fscache_resources_valid(cres) ((cres)->cache_priv) #define fscache_cookie_enabled(cookie) (cookie && !test_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags)) #else #define __fscache_available (0) #define fscache_available() (0) #define fscache_volume_valid(volume) (0) #define fscache_cookie_valid(cookie) (0) #define fscache_resources_valid(cres) (false) #define fscache_cookie_enabled(cookie) (0) #endif struct fscache_cookie; #define FSCACHE_ADV_SINGLE_CHUNK 0x01 /* The object is a single chunk of data */ #define FSCACHE_ADV_WRITE_CACHE 0x00 /* Do cache if written to locally */ #define FSCACHE_ADV_WRITE_NOCACHE 0x02 /* Don't cache if written to locally */ #define FSCACHE_ADV_WANT_CACHE_SIZE 0x04 /* Retrieve cache size at runtime */ #define FSCACHE_INVAL_DIO_WRITE 0x01 /* Invalidate due to DIO write */ enum fscache_want_state { FSCACHE_WANT_PARAMS, FSCACHE_WANT_WRITE, FSCACHE_WANT_READ, }; /* * Data object state. */ enum fscache_cookie_state { FSCACHE_COOKIE_STATE_QUIESCENT, /* The cookie is uncached */ FSCACHE_COOKIE_STATE_LOOKING_UP, /* The cache object is being looked up */ FSCACHE_COOKIE_STATE_CREATING, /* The cache object is being created */ FSCACHE_COOKIE_STATE_ACTIVE, /* The cache is active, readable and writable */ FSCACHE_COOKIE_STATE_INVALIDATING, /* The cache is being invalidated */ FSCACHE_COOKIE_STATE_FAILED, /* The cache failed, withdraw to clear */ FSCACHE_COOKIE_STATE_LRU_DISCARDING, /* The cookie is being discarded by the LRU */ FSCACHE_COOKIE_STATE_WITHDRAWING, /* The cookie is being withdrawn */ FSCACHE_COOKIE_STATE_RELINQUISHING, /* The cookie is being relinquished */ FSCACHE_COOKIE_STATE_DROPPED, /* The cookie has been dropped */ #define FSCACHE_COOKIE_STATE__NR (FSCACHE_COOKIE_STATE_DROPPED + 1) } __attribute__((mode(byte))); /* * Volume representation cookie. */ struct fscache_volume { refcount_t ref; atomic_t n_cookies; /* Number of data cookies in volume */ atomic_t n_accesses; /* Number of cache accesses in progress */ unsigned int debug_id; unsigned int key_hash; /* Hash of key string */ u8 *key; /* Volume ID, eg. "afs@example.com@1234" */ struct list_head proc_link; /* Link in /proc/fs/fscache/volumes */ struct hlist_bl_node hash_link; /* Link in hash table */ struct work_struct work; struct fscache_cache *cache; /* The cache in which this resides */ void *cache_priv; /* Cache private data */ spinlock_t lock; unsigned long flags; #define FSCACHE_VOLUME_RELINQUISHED 0 /* Volume is being cleaned up */ #define FSCACHE_VOLUME_INVALIDATE 1 /* Volume was invalidated */ #define FSCACHE_VOLUME_COLLIDED_WITH 2 /* Volume was collided with */ #define FSCACHE_VOLUME_ACQUIRE_PENDING 3 /* Volume is waiting to complete acquisition */ #define FSCACHE_VOLUME_CREATING 4 /* Volume is being created on disk */ u8 coherency_len; /* Length of the coherency data */ u8 coherency[]; /* Coherency data */ }; /* * Data file representation cookie. 
* - a file will only appear in one cache * - a request to cache a file may or may not be honoured, subject to * constraints such as disk space * - indices are created on disk just-in-time */ struct fscache_cookie { refcount_t ref; atomic_t n_active; /* number of active users of cookie */ atomic_t n_accesses; /* Number of cache accesses in progress */ unsigned int debug_id; unsigned int inval_counter; /* Number of invalidations made */ spinlock_t lock; struct fscache_volume *volume; /* Parent volume of this file. */ void *cache_priv; /* Cache-side representation */ struct hlist_bl_node hash_link; /* Link in hash table */ struct list_head proc_link; /* Link in proc list */ struct list_head commit_link; /* Link in commit queue */ struct work_struct work; /* Commit/relinq/withdraw work */ loff_t object_size; /* Size of the netfs object */ unsigned long unused_at; /* Time at which unused (jiffies) */ unsigned long flags; #define FSCACHE_COOKIE_RELINQUISHED 0 /* T if cookie has been relinquished */ #define FSCACHE_COOKIE_RETIRED 1 /* T if this cookie has retired on relinq */ #define FSCACHE_COOKIE_IS_CACHING 2 /* T if this cookie is cached */ #define FSCACHE_COOKIE_NO_DATA_TO_READ 3 /* T if this cookie has nothing to read */ #define FSCACHE_COOKIE_NEEDS_UPDATE 4 /* T if attrs have been updated */ #define FSCACHE_COOKIE_HAS_BEEN_CACHED 5 /* T if cookie needs withdraw-on-relinq */ #define FSCACHE_COOKIE_DISABLED 6 /* T if cookie has been disabled */ #define FSCACHE_COOKIE_LOCAL_WRITE 7 /* T if cookie has been modified locally */ #define FSCACHE_COOKIE_NO_ACCESS_WAKE 8 /* T if no wake when n_accesses goes 0 */ #define FSCACHE_COOKIE_DO_RELINQUISH 9 /* T if this cookie needs relinquishment */ #define FSCACHE_COOKIE_DO_WITHDRAW 10 /* T if this cookie needs withdrawing */ #define FSCACHE_COOKIE_DO_LRU_DISCARD 11 /* T if this cookie needs LRU discard */ #define FSCACHE_COOKIE_DO_PREP_TO_WRITE 12 /* T if cookie needs write preparation */ #define FSCACHE_COOKIE_HAVE_DATA 13 /* T if this cookie has data stored */ #define FSCACHE_COOKIE_IS_HASHED 14 /* T if this cookie is hashed */ #define FSCACHE_COOKIE_DO_INVALIDATE 15 /* T if cookie needs invalidation */ enum fscache_cookie_state state; u8 advice; /* FSCACHE_ADV_* */ u8 key_len; /* Length of index key */ u8 aux_len; /* Length of auxiliary data */ u32 key_hash; /* Hash of volume, key, len */ union { void *key; /* Index key */ u8 inline_key[16]; /* - If the key is short enough */ }; union { void *aux; /* Auxiliary data */ u8 inline_aux[8]; /* - If the aux data is short enough */ }; }; /* * slow-path functions for when there is actually caching available, and the * netfs does actually have a valid token * - these are not to be called directly * - these are undefined symbols when FS-Cache is not configured and the * optimiser takes care of not using them */ extern struct fscache_volume *__fscache_acquire_volume(const char *, const char *, const void *, size_t); extern void __fscache_relinquish_volume(struct fscache_volume *, const void *, bool); extern struct fscache_cookie *__fscache_acquire_cookie( struct fscache_volume *, u8, const void *, size_t, const void *, size_t, loff_t); extern void __fscache_use_cookie(struct fscache_cookie *, bool); extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); extern void __fscache_resize_cookie(struct fscache_cookie *, loff_t); extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, 
unsigned int); extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *); extern int __fscache_begin_write_operation(struct netfs_cache_resources *, struct fscache_cookie *); void __fscache_write_to_cache(struct fscache_cookie *cookie, struct address_space *mapping, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, bool using_pgpriv2, bool cond); extern void __fscache_clear_page_bits(struct address_space *, loff_t, size_t); /** * fscache_acquire_volume - Register a volume as desiring caching services * @volume_key: An identification string for the volume * @cache_name: The name of the cache to use (or NULL for the default) * @coherency_data: Piece of arbitrary coherency data to check (or NULL) * @coherency_len: The size of the coherency data * * Register a volume as desiring caching services if they're available. The * caller must provide an identifier for the volume and may also indicate which * cache it should be in. If a preexisting volume entry is found in the cache, * the coherency data must match otherwise the entry will be invalidated. * * Returns a cookie pointer on success, -ENOMEM if out of memory or -EBUSY if a * cache volume of that name is already acquired. Note that "NULL" is a valid * cookie pointer and can be returned if caching is refused. */ static inline struct fscache_volume *fscache_acquire_volume(const char *volume_key, const char *cache_name, const void *coherency_data, size_t coherency_len) { if (!fscache_available()) return NULL; return __fscache_acquire_volume(volume_key, cache_name, coherency_data, coherency_len); } /** * fscache_relinquish_volume - Cease caching a volume * @volume: The volume cookie * @coherency_data: Piece of arbitrary coherency data to set (or NULL) * @invalidate: True if the volume should be invalidated * * Indicate that a filesystem no longer desires caching services for a volume. * The caller must have relinquished all file cookies prior to calling this. * The stored coherency data is updated. */ static inline void fscache_relinquish_volume(struct fscache_volume *volume, const void *coherency_data, bool invalidate) { if (fscache_volume_valid(volume)) __fscache_relinquish_volume(volume, coherency_data, invalidate); } /** * fscache_acquire_cookie - Acquire a cookie to represent a cache object * @volume: The volume in which to locate/create this cookie * @advice: Advice flags (FSCACHE_COOKIE_ADV_*) * @index_key: The index key for this cookie * @index_key_len: Size of the index key * @aux_data: The auxiliary data for the cookie (may be NULL) * @aux_data_len: Size of the auxiliary data buffer * @object_size: The initial size of object * * Acquire a cookie to represent a data file within the given cache volume. * * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume, u8 advice, const void *index_key, size_t index_key_len, const void *aux_data, size_t aux_data_len, loff_t object_size) { if (!fscache_volume_valid(volume)) return NULL; return __fscache_acquire_cookie(volume, advice, index_key, index_key_len, aux_data, aux_data_len, object_size); } /** * fscache_use_cookie - Request usage of cookie attached to an object * @cookie: The cookie representing the cache object * @will_modify: If cache is expected to be modified locally * * Request usage of the cookie attached to an object. 
The caller should tell * the cache if the object's contents are about to be modified locally and then * the cache can apply the policy that has been set to handle this case. */ static inline void fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) { if (fscache_cookie_valid(cookie)) __fscache_use_cookie(cookie, will_modify); } /** * fscache_unuse_cookie - Cease usage of cookie attached to an object * @cookie: The cookie representing the cache object * @aux_data: Updated auxiliary data (or NULL) * @object_size: Revised size of the object (or NULL) * * Cease usage of the cookie attached to an object. When the users count * reaches zero then the cookie relinquishment will be permitted to proceed. */ static inline void fscache_unuse_cookie(struct fscache_cookie *cookie, const void *aux_data, const loff_t *object_size) { if (fscache_cookie_valid(cookie)) __fscache_unuse_cookie(cookie, aux_data, object_size); } /** * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding * it * @cookie: The cookie being returned * @retire: True if the cache object the cookie represents is to be discarded * * This function returns a cookie to the cache, forcibly discarding the * associated cache object if retire is set to true. * * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) { if (fscache_cookie_valid(cookie)) __fscache_relinquish_cookie(cookie, retire); } /* * Find the auxiliary data on a cookie. */ static inline void *fscache_get_aux(struct fscache_cookie *cookie) { if (cookie->aux_len <= sizeof(cookie->inline_aux)) return cookie->inline_aux; else return cookie->aux; } /* * Update the auxiliary data on a cookie. */ static inline void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data, const loff_t *object_size) { void *p = fscache_get_aux(cookie); if (aux_data && p) memcpy(p, aux_data, cookie->aux_len); if (object_size) cookie->object_size = *object_size; } #ifdef CONFIG_FSCACHE_STATS extern atomic_t fscache_n_updates; #endif static inline void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data, const loff_t *object_size) { #ifdef CONFIG_FSCACHE_STATS atomic_inc(&fscache_n_updates); #endif fscache_update_aux(cookie, aux_data, object_size); smp_wmb(); set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); } /** * fscache_update_cookie - Request that a cache object be updated * @cookie: The cookie representing the cache object * @aux_data: The updated auxiliary data for the cookie (may be NULL) * @object_size: The current size of the object (may be NULL) * * Request an update of the index data for the cache object associated with the * cookie. The auxiliary data on the cookie will be updated first if @aux_data * is set and the object size will be updated and the object possibly trimmed * if @object_size is set. * * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data, const loff_t *object_size) { if (fscache_cookie_enabled(cookie)) __fscache_update_cookie(cookie, aux_data, object_size); } /** * fscache_resize_cookie - Request that a cache object be resized * @cookie: The cookie representing the cache object * @new_size: The new size of the object (may be NULL) * * Request that the size of an object be changed. 
 * * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline void fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size) { if (fscache_cookie_enabled(cookie)) __fscache_resize_cookie(cookie, new_size); } /** * fscache_invalidate - Notify cache that an object needs invalidation * @cookie: The cookie representing the cache object * @aux_data: The updated auxiliary data for the cookie (may be NULL) * @size: The revised size of the object. * @flags: Invalidation flags (FSCACHE_INVAL_*) * * Notify the cache that an object needs to be invalidated and that it * should abort any retrievals or stores it is doing on the cache. This * increments inval_counter on the cookie which can be used by the caller to * reconsider I/O requests as they complete. * * If @flags has FSCACHE_INVAL_DIO_WRITE set, this indicates that this is due * to a direct I/O write and will cause caching to be disabled on this cookie * until it is completely unused. * * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline void fscache_invalidate(struct fscache_cookie *cookie, const void *aux_data, loff_t size, unsigned int flags) { if (fscache_cookie_enabled(cookie)) __fscache_invalidate(cookie, aux_data, size, flags); } /** * fscache_operation_valid - Return true if operations resources are usable * @cres: The resources to check. * * Returns a pointer to the operations table if usable or NULL if not. */ static inline const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_resources *cres) { return fscache_resources_valid(cres) ? cres->ops : NULL; } /** * fscache_begin_read_operation - Begin a read operation for the netfs lib * @cres: The cache resources for the read being performed * @cookie: The cookie representing the cache object * * Begin a read operation on behalf of the netfs helper library. @cres * indicates the cache resources to which the operation state should be * attached; @cookie indicates the cache object that will be accessed. * * @cres->inval_counter is set from @cookie->inval_counter for comparison at * the end of the operation. This allows invalidation during the operation to * be detected by the caller. * * Returns: * * 0 - Success * * -ENOBUFS - No caching available * * Other error code from the cache, such as -ENOMEM. */ static inline int fscache_begin_read_operation(struct netfs_cache_resources *cres, struct fscache_cookie *cookie) { if (fscache_cookie_enabled(cookie)) return __fscache_begin_read_operation(cres, cookie); return -ENOBUFS; } /** * fscache_end_operation - End the read operation for the netfs lib * @cres: The cache resources for the read operation * * Clean up the resources at the end of the read request. */ static inline void fscache_end_operation(struct netfs_cache_resources *cres) { const struct netfs_cache_ops *ops = fscache_operation_valid(cres); if (ops) ops->end_operation(cres); } /** * fscache_read - Start a read from the cache. * @cres: The cache resources to use * @start_pos: The beginning file offset in the cache file * @iter: The buffer to fill - and also the length * @read_hole: How to handle a hole in the data. * @term_func: The function to call upon completion * @term_func_priv: The private data for @term_func * * Start a read from the cache. @cres indicates the cache object to read from * and must be obtained by a call to fscache_begin_operation() beforehand.
* * The data is read into the iterator, @iter, and that also indicates the size * of the operation. @start_pos is the start position in the file, though if * @seek_data is set appropriately, the cache can use SEEK_DATA to find the * next piece of data, writing zeros for the hole into the iterator. * * Upon termination of the operation, @term_func will be called and supplied * with @term_func_priv plus the amount of data written, if successful, or the * error code otherwise. * * @read_hole indicates how a partially populated region in the cache should be * handled. It can be one of a number of settings: * * NETFS_READ_HOLE_IGNORE - Just try to read (may return a short read). * * NETFS_READ_HOLE_FAIL - Give ENODATA if we encounter a hole. */ static inline int fscache_read(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv) { const struct netfs_cache_ops *ops = fscache_operation_valid(cres); return ops->read(cres, start_pos, iter, read_hole, term_func, term_func_priv); } /** * fscache_begin_write_operation - Begin a write operation for the netfs lib * @cres: The cache resources for the write being performed * @cookie: The cookie representing the cache object * * Begin a write operation on behalf of the netfs helper library. @cres * indicates the cache resources to which the operation state should be * attached; @cookie indicates the cache object that will be accessed. * * @cres->inval_counter is set from @cookie->inval_counter for comparison at * the end of the operation. This allows invalidation during the operation to * be detected by the caller. * * Returns: * * 0 - Success * * -ENOBUFS - No caching available * * Other error code from the cache, such as -ENOMEM. */ static inline int fscache_begin_write_operation(struct netfs_cache_resources *cres, struct fscache_cookie *cookie) { if (fscache_cookie_enabled(cookie)) return __fscache_begin_write_operation(cres, cookie); return -ENOBUFS; } /** * fscache_write - Start a write to the cache. * @cres: The cache resources to use * @start_pos: The beginning file offset in the cache file * @iter: The data to write - and also the length * @term_func: The function to call upon completion * @term_func_priv: The private data for @term_func * * Start a write to the cache. @cres indicates the cache object to write to and * must be obtained by a call to fscache_begin_operation() beforehand. * * The data to be written is obtained from the iterator, @iter, and that also * indicates the size of the operation. @start_pos is the start position in * the file. * * Upon termination of the operation, @term_func will be called and supplied * with @term_func_priv plus the amount of data written, if successful, or the * error code otherwise. */ static inline int fscache_write(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv) { const struct netfs_cache_ops *ops = fscache_operation_valid(cres); return ops->write(cres, start_pos, iter, term_func, term_func_priv); } /** * fscache_clear_page_bits - Clear the PG_fscache bits from a set of pages * @mapping: The netfs inode to use as the source * @start: The start position in @mapping * @len: The amount of data to unlock * @caching: If PG_fscache has been set * * Clear the PG_fscache flag from a sequence of pages and wake up anyone who's * waiting. 
 */ static inline void fscache_clear_page_bits(struct address_space *mapping, loff_t start, size_t len, bool caching) { if (caching) __fscache_clear_page_bits(mapping, start, len); } /** * fscache_write_to_cache - Save a write to the cache and clear PG_fscache * @cookie: The cookie representing the cache object * @mapping: The netfs inode to use as the source * @start: The start position in @mapping * @len: The amount of data to write back * @i_size: The new size of the inode * @term_func: The function to call upon completion * @term_func_priv: The private data for @term_func * @using_pgpriv2: If we're using PG_private_2 to mark in-progress write * @caching: If we actually want to do the caching * * Helper function for a netfs to write dirty data from an inode into the cache * object that's backing it. * * @start and @len describe the range of the data. This does not need to be * page-aligned, but to satisfy DIO requirements, the cache may expand it up to * the page boundaries on either end. All the pages covering the range must be * marked with PG_fscache. * * If given, @term_func will be called upon completion and supplied with * @term_func_priv. Note that if @using_pgpriv2 is set, the PG_private_2 flags * will have been cleared by this point, so the netfs must retain its own pin * on the mapping. */ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, struct address_space *mapping, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, bool using_pgpriv2, bool caching) { if (caching) __fscache_write_to_cache(cookie, mapping, start, len, i_size, term_func, term_func_priv, using_pgpriv2, caching); else if (term_func) term_func(term_func_priv, -ENOBUFS); } /** * fscache_note_page_release - Note that a netfs page got released * @cookie: The cookie corresponding to the file * * Note that a page that has been copied to the cache has been released. This * means that future reads will need to look in the cache to see if it's there. */ static inline void fscache_note_page_release(struct fscache_cookie *cookie) { /* If we've written data to the cache (HAVE_DATA) and there wasn't any * data in the cache when we started (NO_DATA_TO_READ), it may no * longer be true that we can skip reading from the cache - so clear * the flag that causes reads to be skipped. */ if (cookie && test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); } #endif /* _LINUX_FSCACHE_H */
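To make the flow of the API above concrete, here is a minimal usage sketch: a hypothetical network filesystem ("my_netfs") acquires a cookie for a file within an already-acquired volume and reads cached data back through a netfs_cache_resources block. The my_netfs_* names, the 64-bit index key and the synchronous read with a NULL termination callback are assumptions made for illustration only; the calls themselves are the ones documented in this header.

/*
 * Illustrative sketch only; not part of this header.  The volume is assumed
 * to have been obtained earlier with fscache_acquire_volume().
 */
static struct fscache_cookie *my_netfs_cache_file(struct fscache_volume *volume,
                                                  u64 file_id, loff_t i_size)
{
        struct fscache_cookie *cookie;

        /* The index key only has to be unique within the volume. */
        cookie = fscache_acquire_cookie(volume, 0, &file_id, sizeof(file_id),
                                        NULL, 0, i_size);
        if (cookie)
                fscache_use_cookie(cookie, false);      /* read-only user */
        return cookie;
}

static int my_netfs_read_from_cache(struct fscache_cookie *cookie, loff_t pos,
                                    struct iov_iter *iter)
{
        struct netfs_cache_resources cres = {};
        int ret;

        ret = fscache_begin_read_operation(&cres, cookie);
        if (ret < 0)
                return ret;     /* e.g. -ENOBUFS: nothing cached */

        /* Assumed-synchronous read for brevity: no termination callback. */
        ret = fscache_read(&cres, pos, iter, NETFS_READ_HOLE_IGNORE, NULL, NULL);
        fscache_end_operation(&cres);
        return ret;
}

On -ENOBUFS the caller would be expected to fall back to fetching the data from the server, since nothing is held in the cache for that range.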
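The writeback helper can be sketched in the same spirit. The inode wrapper and the completion handler below are illustrative assumptions (a real netfs embeds its cookie in its own inode structure); the termination callback follows the two-argument call convention used by fscache_write_to_cache() in this header, with the actual typedef coming from linux/netfs.h.

/*
 * Illustrative sketch only; not part of this header.  Writing a dirty
 * pagecache range of a hypothetical netfs inode into the cache.
 */
struct my_netfs_inode {
        struct inode vfs_inode;
        struct fscache_cookie *cookie;
};

static void my_netfs_write_done(void *priv, ssize_t transferred_or_error)
{
        /* A real filesystem would record the error for later reporting. */
        if (transferred_or_error < 0)
                pr_debug("cache write failed: %zd\n", transferred_or_error);
}

static void my_netfs_write_region_to_cache(struct my_netfs_inode *ni,
                                           loff_t start, size_t len)
{
        bool caching = fscache_cookie_enabled(ni->cookie);

        /* Pages covering [start, start + len) are assumed to carry PG_fscache. */
        fscache_write_to_cache(ni->cookie, ni->vfs_inode.i_mapping, start, len,
                               i_size_read(&ni->vfs_inode),
                               my_netfs_write_done, NULL,
                               false /* not using PG_private_2 */, caching);
}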
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE().
*/ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? 
max : LONG_MAX; } static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) ns_ref_inc(ns); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && ns_ref_put(ns)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_NAMESPACE_H */
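As a usage sketch for the rlimit-style counters declared above: a hypothetical subsystem charging locked pages against UCOUNT_RLIMIT_MEMLOCK could follow the charge / compare-to-limit / roll-back pattern below. The function names, the error code and the way the limit is supplied are assumptions for illustration; the sketch also assumes, as in-tree callers do, that a LONG_MAX return from inc_rlimit_ucounts() means some namespace ceiling was exceeded.

/*
 * Illustrative sketch only; not part of this header.
 */
static int my_charge_locked_pages(struct ucounts *ucounts, long nr_pages,
                                  unsigned long limit)
{
        long new_total;

        /*
         * Charges every level of the user-namespace hierarchy; LONG_MAX
         * signals that some namespace's configured ceiling was exceeded.
         */
        new_total = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
        if (new_total == LONG_MAX || (unsigned long)new_total > limit) {
                /* Over the limit: undo the charge at every level. */
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
                return -ENOMEM;
        }
        return 0;
}

static void my_uncharge_locked_pages(struct ucounts *ucounts, long nr_pages)
{
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
}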
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PACKET - implements raw packet sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Alan Cox : verify_area() now used correctly * Alan Cox : new skbuff lists, look ma no backlogs! * Alan Cox : tidied skbuff lists. * Alan Cox : Now uses generic datagram routines I * added. Also fixed the peek/read crash * from all old Linux datagram code. * Alan Cox : Uses the improved datagram code. * Alan Cox : Added NULL's for socket options. * Alan Cox : Re-commented the code. * Alan Cox : Use new kernel side addressing * Rob Janssen : Correct MTU usage. * Dave Platt : Counter leaks caused by incorrect * interrupt locking and some slightly * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. * Alan Cox : New buffers. Use sk->mac.raw. * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. * Ulises Alonso : Frame number limit removal and * packet_set_ring memory leak. * Eric Biederman : Allow for > 8 byte hardware addresses. * The convention is that longer addresses * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. * Johann Baudy : Added TX RING. * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer.
* Copyright (C) 2011, <lokec@ccs.neu.edu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include <linux/bpf.h> #include <net/compat.h> #include <linux/netfilter_netdev.h> #include "internal.h" /* Assumptions: - If the device has no dev->header_ops->create, there is no LL header visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. For example, a WiFi driver pretending to be an Ethernet driver should set its hard_header_len to be the Ethernet header length, and set its needed_headroom to be (the real WiFi header length - the fake Ethernet header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ dev_has_header(dev) == true mac_header -> ll header data -> ll header dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data We should set network_header on output to the correct position, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. 
*/ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { struct sockaddr_pkt pkt; union { /* Trick: alias skb original length with * ll.sll_family and ll.protocol in order * to save room. */ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? 
\ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. 
*/ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. 
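 * (An unsigned 32-bit tp_sec counted from the 1970 epoch wraps after
 * 2^32 seconds, roughly 136 years, hence the year-2106 limit.)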
*/ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { struct vlan_hdr vhdr, *vh; unsigned int header_len; if (!dev) return 0; /* In the SOCK_DGRAM scenario, skb data starts at the network * protocol, which is after the VLAN headers. The outer VLAN * header is at the hard_header_len offset in non-variable * length link layer headers. If it's a VLAN device, the * min_header_len should be used to exclude the VLAN header * size. */ if (dev->min_header_len == dev->hard_header_len) header_len = dev->hard_header_len; else if (is_vlan_dev(dev)) header_len = dev->min_header_len; else return 0; vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; return ntohs(vh->h_vlan_TCI); } static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) proto = __vlan_get_protocol_offset(skb, proto, skb_mac_offset(skb), NULL); return proto; } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); hrtimer_cancel(&pkc->retire_blk_timer); } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = 
pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); else p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size)); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, HRTIMER_MODE_REL_SOFT); prb_open_block(p1, pbd); } /* * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. */ static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) { struct packet_sock *po = timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (!frozen) { if (BLOCK_NUM_PKTS(pbd)) { /* Not an empty block. Need retire the block. */ prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); prb_dispatch_next_block(pkc, po); } } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (!prb_curr_blk_in_use(pbd)) { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); } } hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); spin_unlock(&po->sk.sk_receive_queue.lock); return HRTIMER_RESTART; } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. 
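 * BLOCK_STATUS() is the word user space reads to claim a finished
 * block, so it is only written after the packet data above has been
 * flushed; the header page is then flushed again below so the new
 * status itself becomes visible.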
*/ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * prb_open_block is called by tpacket_rcv or timer callback. * * Reasons why NOT update hrtimer in prb_open_block: * 1) It will increase complexity to distinguish the two caller scenario. * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update * TMO of an already enqueued hrtimer, leading to complex shutdown logic. * * One side effect of NOT update hrtimer when called by tpacket_rcv is that * a newly opened block triggered by tpacket_rcv may be retired earlier than * expected. On the other hand, if timeout is updated in prb_open_block, the * frequent reception of network packets that leads to prb_open_block being * called may cause hrtimer to be removed and enqueued repeatedly. */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. 
* call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... 
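 * Fillers hold blk_fill_in_prog_lock for read while copying packet
 * data into the block; taking and immediately dropping the write lock
 * here just waits until every such reader has finished, nothing is
 * modified under it.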
*/ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc); if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) { ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. 
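 * (prb_open_block() ends by calling prb_thaw_queue(), which clears
 * reset_pending_on_curr_blk, so new packets can be stored again.)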
*/ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. 
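 * Only the TX ring allocates pending_refcnt (see packet_alloc_pending()),
 * so a NULL pointer here simply means "RX ring, nothing to count".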
*/ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned 
int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if 
(rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & 
PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (po->num && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __dev_remove_pack(&po->prot_hook); __fanout_link(sk, po); } err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. 
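 * The extra VLAN_HLEN bytes of length slack granted by the callers are
 * only legitimate when the frame really carries an 802.1Q tag, hence
 * the h_proto check below.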
*/ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. 
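 * (The size check below still allows dev->hard_header_len plus
 * VLAN_HLEN of slack on top of the MTU; whether an 802.1Q tag actually
 * justifies that slack is verified later via
 * packet_extra_vlan_len_allowed().)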
*/ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. 
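 * (dev_has_header() is true for devices such as Ethernet that provide
 * header_ops for building and parsing a link-layer header.)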
* * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: atomic_inc(&po->tp_drops); sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). */ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? 
skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. 
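 * ORing in SOF_TIMESTAMPING_SOFTWARE makes tpacket_get_timestamp()
 * fall back to skb_tstamp() even when the user did not ask for
 * software timestamps; only if no timestamp is available at all do we
 * read the clock with ktime_get_real_ts64() below.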
*/ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev); h.h2->tp_vlan_tpid = ntohs(skb->protocol); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); sk_skb_reason_drop(sk, copy_skb, drop_reason); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > 
__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? 
len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { /* Note: packet_read_pending() might be slow if we * have to call it as it's per_cpu variable, but in * fast-path we don't have to call it, only when ph * is NULL, we need to check the pending_refcnt. 
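 * (packet_read_pending() sums the per-cpu counters across every
 * possible CPU, which is why it is kept off the per-frame fast path.)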
*/ if (need_wait && packet_read_pending(&po->tx_ring)) { timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } /* check for additional frames */ continue; } else break; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (1); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. 
*/ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, 
sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? 
dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr *sa = (struct sockaddr *)uaddr; char name[sizeof(sa->sa_data) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, sa->sa_data, sizeof(sa->sa_data)); name[sizeof(sa->sa_data)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; po = pkt_sk(sk); err = packet_alloc_pending(po); if (err) goto out_sk_free; sock_init_data(sock, sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out_sk_free: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? 
*/ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = (sock->type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex); if (dev) { aux.tp_vlan_tci = vlan_get_tci(skb, dev); aux.tp_vlan_tpid = ntohs(skb->protocol); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } rcu_read_unlock(); } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the 
buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp, struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { list_add(&ml->remove_list, list); *mlp = ml->next; } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... 
*/ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } break; } } rtnl_unlock(); return 0; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: if (optlen < sizeof(req_u.req)) break; ret = copy_from_sockptr(&req_u.req, optval, sizeof(req_u.req)) ? -EINVAL : 0; break; case TPACKET_V3: default: if (optlen < sizeof(req_u.req3)) break; ret = copy_from_sockptr(&req_u.req3, optval, sizeof(req_u.req3)) ? 
-EINVAL : 0; break; } if (!ret) ret = packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); release_sock(sk); return ret; } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(pkt_sk(sk)->copy_thresh, val); return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: break; default: return -EINVAL; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_version = val; ret = 0; } release_sock(sk); return ret; } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_reserve = val; ret = 0; } release_sock(sk); return ret; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); return ret; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: case PACKET_VNET_HDR_SZ: { int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (optname == PACKET_VNET_HDR_SZ) { if (val && val != sizeof(struct virtio_net_hdr) && val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) return -EINVAL; hdr_len = val; } else { hdr_len = val ? 
sizeof(struct virtio_net_hdr) : 0; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); return ret; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(po->tp_tstamp, val); return 0; } case PACKET_FANOUT: { struct fanout_args args = { 0 }; if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); } case PACKET_IGNORE_OUTGOING: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_drops = drops; st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_drops = drops; st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_COPY_THRESH: val = READ_ONCE(pkt_sk(sk)->copy_thresh); break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT: val = (po->fanout ? 
((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_IGNORE_OUTGOING: val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; rstats.tp_all = atomic_long_read(&po->rollover->num); rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); data = &rstats; lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct packet_mclist *ml, *tmp; LIST_HEAD(mclist); struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist, &mclist); fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); /* packet_dev_mc might grab instance locks so can't run under rcu */ list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { packet_dev_mc(dev, ml, -1); kfree(ml); } return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= EPOLLOUT | EPOLLWRNORM; } 
spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. */ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; } if (req->tp_block_nr) { unsigned int min_frame_size; /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && req->tp_block_size < BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Block transmit is not supported yet */ if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); } else { struct tpacket_req3 *req3 = &req_u->req3; if (req3->tp_retire_blk_tov || req3->tp_sizeof_priv || req3->tp_feature_req_word) { err = -EINVAL; goto out_free_pg_vec; } } break; default: if (!tx_ring) { rx_owner_map = bitmap_alloc(req->tp_frame_nr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); if (!rx_owner_map) goto out_free_pg_vec; } break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; WRITE_ONCE(po->num, 0); if (was_running) __unregister_prot_hook(sk, false); spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); if (po->tp_version <= TPACKET_V2) swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? 
tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_long_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %ld\n", atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); WRITE_ONCE(po->num, num); if (was_running) register_prot_hook(sk); spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } out_free_pg_vec: if (pg_vec) { bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); } out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) 
seq_printf(seq, "%*sRefCnt Type Proto Iface R Rmem User Inode\n", IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sk_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); #ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { sock_unregister(PF_PACKET); proto_unregister(&packet_proto); unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; rc = register_pernet_subsys(&packet_net_ops); if (rc) goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; rc = proto_register(&packet_proto, 0); if (rc) goto out_notifier; rc = sock_register(&packet_family_ops); if (rc) goto out_proto; return 0; out_proto: proto_unregister(&packet_proto); out_notifier: unregister_netdevice_notifier(&packet_netdev_notifier); out_pernet: unregister_pernet_subsys(&packet_net_ops); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_DESCRIPTION("Packet socket support (AF_PACKET)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET); |
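/*
 * A minimal userspace sketch of the AF_PACKET API implemented in the file
 * above (illustrative only, not part of either kernel source): assuming a
 * Linux host with CAP_NET_RAW and an interface named "eth0", it exercises
 * packet_create() via socket(2), packet_bind() via bind(2) with a
 * struct sockaddr_ll, and packet_recvmsg() via recvfrom(2).
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	unsigned char frame[ETH_FRAME_LEN];
	struct sockaddr_ll sll;
	ssize_t len;
	int fd;

	/* SOCK_RAW delivers frames with the link-layer header included. */
	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Bind to a single interface so only its traffic is delivered. */
	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex = if_nametoindex("eth0");	/* assumed interface name */
	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		perror("bind");
		close(fd);
		return 1;
	}

	/* One blocking read; the kernel-side handler is packet_recvmsg(). */
	len = recvfrom(fd, frame, sizeof(frame), 0, NULL, NULL);
	if (len >= 0)
		printf("received %zd bytes\n", len);

	close(fd);
	return 0;
}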
912 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> * * Generic memory allocators */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/dma-map-ops.h> #include <linux/genalloc.h> #include <linux/highmem.h> #include <linux/vmalloc.h> #ifdef CONFIG_X86 #include <asm/set_memory.h> #endif #include <sound/memalloc.h> struct snd_malloc_ops { void *(*alloc)(struct snd_dma_buffer *dmab, size_t size); void (*free)(struct snd_dma_buffer *dmab); dma_addr_t (*get_addr)(struct snd_dma_buffer *dmab, size_t offset); struct page *(*get_page)(struct snd_dma_buffer *dmab, size_t offset); unsigned int (*get_chunk_size)(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size); int (*mmap)(struct snd_dma_buffer *dmab, struct vm_area_struct *area); void (*sync)(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode); }; #define DEFAULT_GFP \ (GFP_KERNEL | \ __GFP_RETRY_MAYFAIL | /* don't trigger OOM-killer */ \ __GFP_NOWARN) /* no stack trace print - this call is non-critical */ static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab); static void *__snd_dma_alloc_pages(struct snd_dma_buffer *dmab, size_t size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (WARN_ON_ONCE(!ops || !ops->alloc)) return NULL; return ops->alloc(dmab, size); } /** * snd_dma_alloc_dir_pages - allocate the buffer area according to the given * type and direction * @type: the DMA buffer type * @device: the device pointer * @dir: DMA direction * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_dir_pages(int type, struct device *device, enum dma_data_direction dir, size_t size, struct snd_dma_buffer *dmab) { if (WARN_ON(!size)) return -ENXIO; if (WARN_ON(!dmab)) return -ENXIO; size = PAGE_ALIGN(size); dmab->dev.type = type; dmab->dev.dev = device; dmab->dev.dir = dir; dmab->bytes = 0; dmab->addr = 0; dmab->private_data = NULL; dmab->area = __snd_dma_alloc_pages(dmab, size); if (!dmab->area) return -ENOMEM; dmab->bytes = size; return 0; } EXPORT_SYMBOL(snd_dma_alloc_dir_pages); /** * snd_dma_alloc_pages_fallback - allocate the buffer area according to the given type with fallback * @type: the DMA buffer type * @device: the device pointer * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. When no space is left, this function reduces the size and * tries to allocate again. The size actually allocated is stored in * res_size argument. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_pages_fallback(int type, struct device *device, size_t size, struct snd_dma_buffer *dmab) { int err; while ((err = snd_dma_alloc_pages(type, device, size, dmab)) < 0) { if (err != -ENOMEM) return err; if (size <= PAGE_SIZE) return -ENOMEM; size >>= 1; size = PAGE_SIZE << get_order(size); } if (! 
dmab->area) return -ENOMEM; return 0; } EXPORT_SYMBOL(snd_dma_alloc_pages_fallback); /** * snd_dma_free_pages - release the allocated buffer * @dmab: the buffer allocation record to release * * Releases the allocated buffer via snd_dma_alloc_pages(). */ void snd_dma_free_pages(struct snd_dma_buffer *dmab) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->free) ops->free(dmab); } EXPORT_SYMBOL(snd_dma_free_pages); /* called by devres */ static void __snd_release_pages(struct device *dev, void *res) { snd_dma_free_pages(res); } /** * snd_devm_alloc_dir_pages - allocate the buffer and manage with devres * @dev: the device pointer * @type: the DMA buffer type * @dir: DMA direction * @size: the buffer size to allocate * * Allocate buffer pages depending on the given type and manage using devres. * The pages will be released automatically at the device removal. * * Unlike snd_dma_alloc_pages(), this function requires the real device pointer, * hence it can't work with SNDRV_DMA_TYPE_CONTINUOUS or * SNDRV_DMA_TYPE_VMALLOC type. * * Return: the snd_dma_buffer object at success, or NULL if failed */ struct snd_dma_buffer * snd_devm_alloc_dir_pages(struct device *dev, int type, enum dma_data_direction dir, size_t size) { struct snd_dma_buffer *dmab; int err; if (WARN_ON(type == SNDRV_DMA_TYPE_CONTINUOUS || type == SNDRV_DMA_TYPE_VMALLOC)) return NULL; dmab = devres_alloc(__snd_release_pages, sizeof(*dmab), GFP_KERNEL); if (!dmab) return NULL; err = snd_dma_alloc_dir_pages(type, dev, dir, size, dmab); if (err < 0) { devres_free(dmab); return NULL; } devres_add(dev, dmab); return dmab; } EXPORT_SYMBOL_GPL(snd_devm_alloc_dir_pages); /** * snd_dma_buffer_mmap - perform mmap of the given DMA buffer * @dmab: buffer allocation information * @area: VM area information * * Return: zero if successful, or a negative error code */ int snd_dma_buffer_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { const struct snd_malloc_ops *ops; if (!dmab) return -ENOENT; ops = snd_dma_get_ops(dmab); if (ops && ops->mmap) return ops->mmap(dmab, area); else return -ENOENT; } EXPORT_SYMBOL(snd_dma_buffer_mmap); #ifdef CONFIG_HAS_DMA /** * snd_dma_buffer_sync - sync DMA buffer between CPU and device * @dmab: buffer allocation information * @mode: sync mode */ void snd_dma_buffer_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { const struct snd_malloc_ops *ops; if (!dmab || !dmab->dev.need_sync) return; ops = snd_dma_get_ops(dmab); if (ops && ops->sync) ops->sync(dmab, mode); } EXPORT_SYMBOL_GPL(snd_dma_buffer_sync); #endif /* CONFIG_HAS_DMA */ /** * snd_sgbuf_get_addr - return the physical address at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the physical address */ dma_addr_t snd_sgbuf_get_addr(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_addr) return ops->get_addr(dmab, offset); else return dmab->addr + offset; } EXPORT_SYMBOL(snd_sgbuf_get_addr); /** * snd_sgbuf_get_page - return the physical page at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the page pointer */ struct page *snd_sgbuf_get_page(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_page) return ops->get_page(dmab, offset); else return virt_to_page(dmab->area + offset); } EXPORT_SYMBOL(snd_sgbuf_get_page); /** * 
snd_sgbuf_get_chunk_size - compute the max chunk size with continuous pages * on sg-buffer * @dmab: buffer allocation information * @ofs: offset in the ring buffer * @size: the requested size * * Return: the chunk size */ unsigned int snd_sgbuf_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_chunk_size) return ops->get_chunk_size(dmab, ofs, size); else return size; } EXPORT_SYMBOL(snd_sgbuf_get_chunk_size); /* * Continuous pages allocator */ static void *do_alloc_pages(struct device *dev, size_t size, dma_addr_t *addr, bool wc) { void *p; gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; again: p = alloc_pages_exact(size, gfp); if (!p) return NULL; *addr = page_to_phys(virt_to_page(p)); if (!dev) return p; if ((*addr + size - 1) & ~dev->coherent_dma_mask) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && !(gfp & GFP_DMA32)) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } } #ifdef CONFIG_X86 if (wc) set_memory_wc((unsigned long)(p), size >> PAGE_SHIFT); #endif return p; } static void do_free_pages(void *p, size_t size, bool wc) { #ifdef CONFIG_X86 if (wc) set_memory_wb((unsigned long)(p), size >> PAGE_SHIFT); #endif free_pages_exact(p, size); } static void *snd_dma_continuous_alloc(struct snd_dma_buffer *dmab, size_t size) { return do_alloc_pages(dmab->dev.dev, size, &dmab->addr, false); } static void snd_dma_continuous_free(struct snd_dma_buffer *dmab) { do_free_pages(dmab->area, dmab->bytes, false); } static int snd_dma_continuous_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_continuous_ops = { .alloc = snd_dma_continuous_alloc, .free = snd_dma_continuous_free, .mmap = snd_dma_continuous_mmap, }; /* * VMALLOC allocator */ static void *snd_dma_vmalloc_alloc(struct snd_dma_buffer *dmab, size_t size) { return vmalloc(size); } static void snd_dma_vmalloc_free(struct snd_dma_buffer *dmab) { vfree(dmab->area); } static int snd_dma_vmalloc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_vmalloc_range(area, dmab->area, 0); } #define get_vmalloc_page_addr(dmab, offset) \ page_to_phys(vmalloc_to_page((dmab)->area + (offset))) static dma_addr_t snd_dma_vmalloc_get_addr(struct snd_dma_buffer *dmab, size_t offset) { return get_vmalloc_page_addr(dmab, offset) + offset % PAGE_SIZE; } static struct page *snd_dma_vmalloc_get_page(struct snd_dma_buffer *dmab, size_t offset) { return vmalloc_to_page(dmab->area + offset); } static unsigned int snd_dma_vmalloc_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ /* check page continuity */ addr = get_vmalloc_page_addr(dmab, start); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (get_vmalloc_page_addr(dmab, start) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_vmalloc_ops = { .alloc = snd_dma_vmalloc_alloc, .free = snd_dma_vmalloc_free, .mmap = snd_dma_vmalloc_mmap, .get_addr = snd_dma_vmalloc_get_addr, .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, 
}; #ifdef CONFIG_HAS_DMA /* * IRAM allocator */ #ifdef CONFIG_GENERIC_ALLOCATOR static void *snd_dma_iram_alloc(struct snd_dma_buffer *dmab, size_t size) { struct device *dev = dmab->dev.dev; struct gen_pool *pool; void *p; if (dev->of_node) { pool = of_gen_pool_get(dev->of_node, "iram", 0); /* Assign the pool into private_data field */ dmab->private_data = pool; p = gen_pool_dma_alloc_align(pool, size, &dmab->addr, PAGE_SIZE); if (p) return p; } /* Internal memory might have limited size and no enough space, * so if we fail to malloc, try to fetch memory traditionally. */ dmab->dev.type = SNDRV_DMA_TYPE_DEV; return __snd_dma_alloc_pages(dmab, size); } static void snd_dma_iram_free(struct snd_dma_buffer *dmab) { struct gen_pool *pool = dmab->private_data; if (pool && dmab->area) gen_pool_free(pool, (unsigned long)dmab->area, dmab->bytes); } static int snd_dma_iram_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_iram_ops = { .alloc = snd_dma_iram_alloc, .free = snd_dma_iram_free, .mmap = snd_dma_iram_mmap, }; #endif /* CONFIG_GENERIC_ALLOCATOR */ /* * Coherent device pages allocator */ static void *snd_dma_dev_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_dev_free(struct snd_dma_buffer *dmab) { dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_dev_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } static const struct snd_malloc_ops snd_dma_dev_ops = { .alloc = snd_dma_dev_alloc, .free = snd_dma_dev_free, .mmap = snd_dma_dev_mmap, }; /* * Write-combined pages */ #ifdef CONFIG_SND_DMA_SGBUF /* x86-specific allocations */ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p = do_alloc_pages(dmab->dev.dev, size, &dmab->addr, true); if (!p) return NULL; dmab->addr = dma_map_single(dmab->dev.dev, p, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dmab->dev.dev, dmab->addr)) { do_free_pages(dmab->area, size, true); return NULL; } return p; } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_unmap_single(dmab->dev.dev, dmab->addr, dmab->bytes, DMA_BIDIRECTIONAL); do_free_pages(dmab->area, dmab->bytes, true); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #else static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_wc(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_free_wc(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_wc(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #endif static const struct snd_malloc_ops snd_dma_wc_ops = { .alloc = snd_dma_wc_alloc, .free = snd_dma_wc_free, .mmap = snd_dma_wc_mmap, }; /* * Non-contiguous pages allocator */ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) { struct sg_table *sgt; void *p; sgt = 
dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir, DEFAULT_GFP, 0); if (!sgt) return NULL; dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, sg_dma_address(sgt->sgl)); p = dma_vmap_noncontiguous(dmab->dev.dev, size, sgt); if (p) { dmab->private_data = sgt; /* store the first page address for convenience */ dmab->addr = snd_sgbuf_get_addr(dmab, 0); } else { dma_free_noncontiguous(dmab->dev.dev, size, sgt, dmab->dev.dir); } return p; } static void snd_dma_noncontig_free(struct snd_dma_buffer *dmab) { dma_vunmap_noncontiguous(dmab->dev.dev, dmab->area); dma_free_noncontiguous(dmab->dev.dev, dmab->bytes, dmab->private_data, dmab->dev.dir); } static int snd_dma_noncontig_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_noncontiguous(dmab->dev.dev, area, dmab->bytes, dmab->private_data); } static void snd_dma_noncontig_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir == DMA_TO_DEVICE) return; invalidate_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_cpu(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } else { if (dmab->dev.dir == DMA_FROM_DEVICE) return; flush_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_device(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } } static inline void snd_dma_noncontig_iter_set(struct snd_dma_buffer *dmab, struct sg_page_iter *piter, size_t offset) { struct sg_table *sgt = dmab->private_data; __sg_page_iter_start(piter, sgt->sgl, sgt->orig_nents, offset >> PAGE_SHIFT); } static dma_addr_t snd_dma_noncontig_get_addr(struct snd_dma_buffer *dmab, size_t offset) { struct sg_dma_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter.base, offset); __sg_page_iter_dma_next(&iter); return sg_page_iter_dma_address(&iter) + offset % PAGE_SIZE; } static struct page *snd_dma_noncontig_get_page(struct snd_dma_buffer *dmab, size_t offset) { struct sg_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter, offset); __sg_page_iter_next(&iter); return sg_page_iter_page(&iter); } static unsigned int snd_dma_noncontig_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { struct sg_dma_page_iter iter; unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ snd_dma_noncontig_iter_set(dmab, &iter.base, start); if (!__sg_page_iter_dma_next(&iter)) return 0; /* check page continuity */ addr = sg_page_iter_dma_address(&iter); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (!__sg_page_iter_dma_next(&iter) || sg_page_iter_dma_address(&iter) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_noncontig_ops = { .alloc = snd_dma_noncontig_alloc, .free = snd_dma_noncontig_free, .mmap = snd_dma_noncontig_mmap, .sync = snd_dma_noncontig_sync, .get_addr = snd_dma_noncontig_get_addr, .get_page = snd_dma_noncontig_get_page, .get_chunk_size = snd_dma_noncontig_get_chunk_size, }; #ifdef CONFIG_SND_DMA_SGBUF /* Fallback SG-buffer allocations for x86 */ struct snd_dma_sg_fallback { struct sg_table sgt; /* used by get_addr - must be the first item */ size_t count; struct page **pages; unsigned int *npages; }; static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab, struct snd_dma_sg_fallback *sgbuf) { bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG; size_t i, size; if (sgbuf->pages && sgbuf->npages) { i = 0; while (i < sgbuf->count) { 
size = sgbuf->npages[i]; if (!size) break; do_free_pages(page_address(sgbuf->pages[i]), size << PAGE_SHIFT, wc); i += size; } } kvfree(sgbuf->pages); kvfree(sgbuf->npages); kfree(sgbuf); } /* fallback manual S/G buffer allocations */ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) { bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG; struct snd_dma_sg_fallback *sgbuf; struct page **pagep, *curp; size_t chunk; dma_addr_t addr; unsigned int idx, npages; void *p; sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL); if (!sgbuf) return NULL; size = PAGE_ALIGN(size); sgbuf->count = size >> PAGE_SHIFT; sgbuf->pages = kvcalloc(sgbuf->count, sizeof(*sgbuf->pages), GFP_KERNEL); sgbuf->npages = kvcalloc(sgbuf->count, sizeof(*sgbuf->npages), GFP_KERNEL); if (!sgbuf->pages || !sgbuf->npages) goto error; pagep = sgbuf->pages; chunk = size; idx = 0; while (size > 0) { chunk = min(size, chunk); p = do_alloc_pages(dmab->dev.dev, chunk, &addr, wc); if (!p) { if (chunk <= PAGE_SIZE) goto error; chunk >>= 1; chunk = PAGE_SIZE << get_order(chunk); continue; } size -= chunk; /* fill pages */ npages = chunk >> PAGE_SHIFT; sgbuf->npages[idx] = npages; idx += npages; curp = virt_to_page(p); while (npages--) *pagep++ = curp++; } if (sg_alloc_table_from_pages(&sgbuf->sgt, sgbuf->pages, sgbuf->count, 0, sgbuf->count << PAGE_SHIFT, GFP_KERNEL)) goto error; if (dma_map_sgtable(dmab->dev.dev, &sgbuf->sgt, DMA_BIDIRECTIONAL, 0)) goto error_dma_map; p = vmap(sgbuf->pages, sgbuf->count, VM_MAP, PAGE_KERNEL); if (!p) goto error_vmap; dmab->private_data = sgbuf; /* store the first page address for convenience */ dmab->addr = snd_sgbuf_get_addr(dmab, 0); return p; error_vmap: dma_unmap_sgtable(dmab->dev.dev, &sgbuf->sgt, DMA_BIDIRECTIONAL, 0); error_dma_map: sg_free_table(&sgbuf->sgt); error: __snd_dma_sg_fallback_free(dmab, sgbuf); return NULL; } static void snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; vunmap(dmab->area); dma_unmap_sgtable(dmab->dev.dev, &sgbuf->sgt, DMA_BIDIRECTIONAL, 0); sg_free_table(&sgbuf->sgt); __snd_dma_sg_fallback_free(dmab, dmab->private_data); } static int snd_dma_sg_fallback_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return vm_map_pages(area, sgbuf->pages, sgbuf->count); } static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) { int type = dmab->dev.type; void *p; /* try the standard DMA API allocation at first */ if (type == SNDRV_DMA_TYPE_DEV_WC_SG) dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC; else dmab->dev.type = SNDRV_DMA_TYPE_DEV; p = __snd_dma_alloc_pages(dmab, size); if (p) return p; dmab->dev.type = type; /* restore the type */ return snd_dma_sg_fallback_alloc(dmab, size); } static const struct snd_malloc_ops snd_dma_sg_ops = { .alloc = snd_dma_sg_alloc, .free = snd_dma_sg_fallback_free, .mmap = snd_dma_sg_fallback_mmap, /* reuse noncontig helper */ .get_addr = snd_dma_noncontig_get_addr, /* reuse vmalloc helpers */ .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; #endif /* CONFIG_SND_DMA_SGBUF */ /* * Non-coherent pages allocator */ static void *snd_dma_noncoherent_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p; p = dma_alloc_noncoherent(dmab->dev.dev, size, &dmab->addr, dmab->dev.dir, DEFAULT_GFP); if (p) dmab->dev.need_sync = 
dma_need_sync(dmab->dev.dev, dmab->addr); return p; } static void snd_dma_noncoherent_free(struct snd_dma_buffer *dmab) { dma_free_noncoherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr, dmab->dev.dir); } static int snd_dma_noncoherent_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = vm_get_page_prot(area->vm_flags); return dma_mmap_pages(dmab->dev.dev, area, area->vm_end - area->vm_start, virt_to_page(dmab->area)); } static void snd_dma_noncoherent_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir != DMA_TO_DEVICE) dma_sync_single_for_cpu(dmab->dev.dev, dmab->addr, dmab->bytes, dmab->dev.dir); } else { if (dmab->dev.dir != DMA_FROM_DEVICE) dma_sync_single_for_device(dmab->dev.dev, dmab->addr, dmab->bytes, dmab->dev.dir); } } static const struct snd_malloc_ops snd_dma_noncoherent_ops = { .alloc = snd_dma_noncoherent_alloc, .free = snd_dma_noncoherent_free, .mmap = snd_dma_noncoherent_mmap, .sync = snd_dma_noncoherent_sync, }; #endif /* CONFIG_HAS_DMA */ /* * Entry points */ static const struct snd_malloc_ops *snd_dma_ops[] = { [SNDRV_DMA_TYPE_CONTINUOUS] = &snd_dma_continuous_ops, [SNDRV_DMA_TYPE_VMALLOC] = &snd_dma_vmalloc_ops, #ifdef CONFIG_HAS_DMA [SNDRV_DMA_TYPE_DEV] = &snd_dma_dev_ops, [SNDRV_DMA_TYPE_DEV_WC] = &snd_dma_wc_ops, [SNDRV_DMA_TYPE_NONCONTIG] = &snd_dma_noncontig_ops, [SNDRV_DMA_TYPE_NONCOHERENT] = &snd_dma_noncoherent_ops, #ifdef CONFIG_SND_DMA_SGBUF [SNDRV_DMA_TYPE_DEV_SG] = &snd_dma_sg_ops, [SNDRV_DMA_TYPE_DEV_WC_SG] = &snd_dma_sg_ops, #endif #ifdef CONFIG_GENERIC_ALLOCATOR [SNDRV_DMA_TYPE_DEV_IRAM] = &snd_dma_iram_ops, #endif /* CONFIG_GENERIC_ALLOCATOR */ #endif /* CONFIG_HAS_DMA */ }; static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab) { if (WARN_ON_ONCE(!dmab)) return NULL; if (WARN_ON_ONCE(dmab->dev.type <= SNDRV_DMA_TYPE_UNKNOWN || dmab->dev.type >= ARRAY_SIZE(snd_dma_ops))) return NULL; return snd_dma_ops[dmab->dev.type]; }
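/*
 * Usage sketch (illustration only, not part of the file above): how a caller
 * might drive these entry points to build one hardware descriptor per
 * physically continuous chunk of an SG buffer.  my_hw_program_desc() and
 * my_setup_sg_buffer() are hypothetical names; only snd_dma_alloc_pages(),
 * snd_dma_free_pages(), snd_sgbuf_get_addr() and snd_sgbuf_get_chunk_size()
 * are the real ALSA helpers.
 */
#include <linux/device.h>
#include <linux/printk.h>
#include <sound/memalloc.h>

/* hypothetical hardware-specific helper, stubbed out for the sketch */
static void my_hw_program_desc(dma_addr_t addr, unsigned int len)
{
	pr_debug("desc: addr=%pad len=%u\n", &addr, len);
}

static int my_setup_sg_buffer(struct device *dev, size_t bytes,
			      struct snd_dma_buffer *dmab)
{
	unsigned int ofs = 0;
	int err;

	/* SNDRV_DMA_TYPE_DEV_SG may silently fall back, see snd_dma_sg_alloc() */
	err = snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV_SG, dev, bytes, dmab);
	if (err < 0)
		return err;

	while (ofs < dmab->bytes) {
		/* largest physically continuous run starting at ofs */
		unsigned int chunk = snd_sgbuf_get_chunk_size(dmab, ofs,
							      dmab->bytes - ofs);

		my_hw_program_desc(snd_sgbuf_get_addr(dmab, ofs), chunk);
		ofs += chunk;
	}
	return 0;
}

/* the buffer would be released with snd_dma_free_pages(dmab) at teardown */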
// SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Priority Queue * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #include <linux/time.h> #include <linux/slab.h> #include <sound/core.h> #include "seq_timer.h" #include "seq_prioq.h" /* Implementation is a simple linked list for now... This priority queue orders the events on timestamp. For events with an equal timestamp the queue behaves as a FIFO.
* * +-------+ * Head --> | first | * +-------+ * |next * +-----v-+ * | | * +-------+ * | * +-----v-+ * | | * +-------+ * | * +-----v-+ * Tail --> | last | * +-------+ * */ /* create new prioq (constructor) */ struct snd_seq_prioq *snd_seq_prioq_new(void) { struct snd_seq_prioq *f; f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) return NULL; spin_lock_init(&f->lock); f->head = NULL; f->tail = NULL; f->cells = 0; return f; } /* delete prioq (destructor) */ void snd_seq_prioq_delete(struct snd_seq_prioq **fifo) { struct snd_seq_prioq *f = *fifo; *fifo = NULL; if (f == NULL) { pr_debug("ALSA: seq: snd_seq_prioq_delete() called with NULL prioq\n"); return; } /* release resources...*/ /*....................*/ if (f->cells > 0) { /* drain prioQ */ while (f->cells > 0) snd_seq_cell_free(snd_seq_prioq_cell_out(f, NULL)); } kfree(f); } /* compare timestamp between events */ /* return 1 if a >= b; otherwise 0 */ static inline int compare_timestamp(struct snd_seq_event *a, struct snd_seq_event *b) { if ((a->flags & SNDRV_SEQ_TIME_STAMP_MASK) == SNDRV_SEQ_TIME_STAMP_TICK) { /* compare ticks */ return (snd_seq_compare_tick_time(&a->time.tick, &b->time.tick)); } else { /* compare real time */ return (snd_seq_compare_real_time(&a->time.time, &b->time.time)); } } /* compare timestamp between events */ /* return negative if a < b; * zero if a = b; * positive if a > b; */ static inline int compare_timestamp_rel(struct snd_seq_event *a, struct snd_seq_event *b) { if ((a->flags & SNDRV_SEQ_TIME_STAMP_MASK) == SNDRV_SEQ_TIME_STAMP_TICK) { /* compare ticks */ if (a->time.tick > b->time.tick) return 1; else if (a->time.tick == b->time.tick) return 0; else return -1; } else { /* compare real time */ if (a->time.time.tv_sec > b->time.time.tv_sec) return 1; else if (a->time.time.tv_sec == b->time.time.tv_sec) { if (a->time.time.tv_nsec > b->time.time.tv_nsec) return 1; else if (a->time.time.tv_nsec == b->time.time.tv_nsec) return 0; else return -1; } else return -1; } } /* enqueue cell to prioq */ int snd_seq_prioq_cell_in(struct snd_seq_prioq * f, struct snd_seq_event_cell * cell) { struct snd_seq_event_cell *cur, *prev; int count; int prior; if (snd_BUG_ON(!f || !cell)) return -EINVAL; /* check flags */ prior = (cell->event.flags & SNDRV_SEQ_PRIORITY_MASK); guard(spinlock_irqsave)(&f->lock); /* check if this element needs to be inserted at the end (i.e. ordered data is inserted). This will be very likely if a sequencer application or MIDI file player is feeding us (sequential) data */ if (f->tail && !prior) { if (compare_timestamp(&cell->event, &f->tail->event)) { /* add new cell to tail of the fifo */ f->tail->next = cell; f->tail = cell; cell->next = NULL; f->cells++; return 0; } } /* traverse list of elements to find the place where the new cell is to be inserted... Note that this is an order-n process! */ prev = NULL; /* previous cell */ cur = f->head; /* cursor */ count = 10000; /* FIXME: big enough, isn't it? */ while (cur != NULL) { /* compare timestamps */ int rel = compare_timestamp_rel(&cell->event, &cur->event); if (rel < 0) /* new cell has an earlier schedule time */ break; else if (rel == 0 && prior) /* equal schedule time and prior to others */ break; /* new cell has an equal or later schedule time, */ /* move cursor to next cell */ prev = cur; cur = cur->next; if (! --count) { pr_err("ALSA: seq: cannot find a pointer..
infinite loop?\n"); return -EINVAL; } } /* insert it before cursor */ if (prev != NULL) prev->next = cell; cell->next = cur; if (f->head == cur) /* this is the first cell, set head to it */ f->head = cell; if (cur == NULL) /* reached end of the list */ f->tail = cell; f->cells++; return 0; } /* return 1 if the current time >= event timestamp */ static int event_is_ready(struct snd_seq_event *ev, void *current_time) { if ((ev->flags & SNDRV_SEQ_TIME_STAMP_MASK) == SNDRV_SEQ_TIME_STAMP_TICK) return snd_seq_compare_tick_time(current_time, &ev->time.tick); else return snd_seq_compare_real_time(current_time, &ev->time.time); } /* dequeue cell from prioq */ struct snd_seq_event_cell *snd_seq_prioq_cell_out(struct snd_seq_prioq *f, void *current_time) { struct snd_seq_event_cell *cell; if (f == NULL) { pr_debug("ALSA: seq: snd_seq_prioq_cell_out() called with NULL prioq\n"); return NULL; } guard(spinlock_irqsave)(&f->lock); cell = f->head; if (cell && current_time && !event_is_ready(&cell->event, current_time)) cell = NULL; if (cell) { f->head = cell->next; /* reset tail if this was the last element */ if (f->tail == cell) f->tail = NULL; cell->next = NULL; f->cells--; } return cell; } /* return number of events available in prioq */ int snd_seq_prioq_avail(struct snd_seq_prioq * f) { if (f == NULL) { pr_debug("ALSA: seq: snd_seq_prioq_avail() called with NULL prioq\n"); return 0; } return f->cells; } /* remove cells matching the condition */ static void prioq_remove_cells(struct snd_seq_prioq *f, bool (*match)(struct snd_seq_event_cell *cell, void *arg), void *arg) { register struct snd_seq_event_cell *cell, *next; struct snd_seq_event_cell *prev = NULL; struct snd_seq_event_cell *freefirst = NULL, *freeprev = NULL, *freenext; /* collect all removed cells */ scoped_guard(spinlock_irqsave, &f->lock) { for (cell = f->head; cell; cell = next) { next = cell->next; if (!match(cell, arg)) { prev = cell; continue; } /* remove cell from prioq */ if (cell == f->head) f->head = cell->next; else prev->next = cell->next; if (cell == f->tail) f->tail = cell->next; f->cells--; /* add cell to free list */ cell->next = NULL; if (freefirst == NULL) freefirst = cell; else freeprev->next = cell; freeprev = cell; } } /* remove selected cells */ while (freefirst) { freenext = freefirst->next; snd_seq_cell_free(freefirst); freefirst = freenext; } } struct prioq_match_arg { int client; int timestamp; }; static inline bool prioq_match(struct snd_seq_event_cell *cell, void *arg) { struct prioq_match_arg *v = arg; if (cell->event.source.client == v->client || cell->event.dest.client == v->client) return true; if (!v->timestamp) return false; switch (cell->event.flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: if (cell->event.time.tick) return true; break; case SNDRV_SEQ_TIME_STAMP_REAL: if (cell->event.time.time.tv_sec || cell->event.time.time.tv_nsec) return true; break; } return false; } /* remove cells for a client that has left */ void snd_seq_prioq_leave(struct snd_seq_prioq *f, int client, int timestamp) { struct prioq_match_arg arg = { client, timestamp }; return prioq_remove_cells(f, prioq_match, &arg); } struct prioq_remove_match_arg { int client; struct snd_seq_remove_events *info; }; static bool prioq_remove_match(struct snd_seq_event_cell *cell, void *arg) { struct prioq_remove_match_arg *v = arg; struct snd_seq_event *ev = &cell->event; struct snd_seq_remove_events *info = v->info; int res; if (ev->source.client != v->client) return false; if (info->remove_mode & SNDRV_SEQ_REMOVE_DEST) { if
(ev->dest.client != info->dest.client || ev->dest.port != info->dest.port) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_DEST_CHANNEL) { if (! snd_seq_ev_is_channel_type(ev)) return false; /* data.note.channel and data.control.channel are identical */ if (ev->data.note.channel != info->channel) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_AFTER) { if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_TICK) res = snd_seq_compare_tick_time(&ev->time.tick, &info->time.tick); else res = snd_seq_compare_real_time(&ev->time.time, &info->time.time); if (!res) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_BEFORE) { if (info->remove_mode & SNDRV_SEQ_REMOVE_TIME_TICK) res = snd_seq_compare_tick_time(&ev->time.tick, &info->time.tick); else res = snd_seq_compare_real_time(&ev->time.time, &info->time.time); if (res) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_EVENT_TYPE) { if (ev->type != info->type) return false; } if (info->remove_mode & SNDRV_SEQ_REMOVE_IGNORE_OFF) { /* Do not remove off events */ switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEOFF: /* case SNDRV_SEQ_EVENT_SAMPLE_STOP: */ return false; default: break; } } if (info->remove_mode & SNDRV_SEQ_REMOVE_TAG_MATCH) { if (info->tag != ev->tag) return false; } return true; } /* remove cells matching remove criteria */ void snd_seq_prioq_remove_events(struct snd_seq_prioq * f, int client, struct snd_seq_remove_events *info) { struct prioq_remove_match_arg arg = { client, info }; return prioq_remove_cells(f, prioq_remove_match, &arg); }
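/*
 * A stand-alone sketch of the insertion rule used by snd_seq_prioq_cell_in()
 * above (illustration only): plain integers stand in for the tick/real-time
 * stamps, and "prior" mirrors the SNDRV_SEQ_PRIORITY_HIGH flag.  Nodes with a
 * later-or-equal stamp keep FIFO order, while a prioritized node is placed
 * ahead of existing nodes carrying the same stamp.
 */
#include <stdbool.h>
#include <stdio.h>

struct node {
	int stamp;		/* timestamp */
	bool prior;		/* "high priority" flag */
	int seq;		/* submission order, for demonstration */
	struct node *next;
};

static void insert(struct node **head, struct node *n)
{
	struct node **link = head;

	while (*link) {
		int rel = n->stamp - (*link)->stamp;

		if (rel < 0 || (rel == 0 && n->prior))
			break;			/* earlier, or equal but prioritized */
		link = &(*link)->next;		/* equal or later: keep FIFO order */
	}
	n->next = *link;
	*link = n;
}

int main(void)
{
	struct node a = { 10, false, 1 }, b = { 10, false, 2 }, c = { 10, true, 3 };
	struct node *head = NULL, *p;

	insert(&head, &a);
	insert(&head, &b);	/* same stamp, no priority: queued after a */
	insert(&head, &c);	/* same stamp, priority: placed before a and b */

	for (p = head; p; p = p->next)
		printf("seq=%d stamp=%d prior=%d\n", p->seq, p->stamp, (int)p->prior);
	return 0;	/* prints seq=3, then seq=1, then seq=2 */
}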
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM btrfs #if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BTRFS_H #include <linux/writeback.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> struct btrfs_root; struct btrfs_fs_info; struct btrfs_inode; struct extent_map; struct btrfs_file_extent_item; struct btrfs_ordered_extent; struct btrfs_delayed_ref_node; struct btrfs_delayed_ref_head; struct btrfs_block_group; struct btrfs_free_cluster; struct btrfs_chunk_map; struct extent_buffer; struct btrfs_work; struct btrfs_workqueue; struct btrfs_qgroup_extent_record; struct btrfs_qgroup; struct extent_io_tree; struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) #define __show_root_type(obj) \ __print_symbolic_u64(obj, \ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, \ { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, \ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, \ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },\ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) #define show_root_type(obj) \ obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ (obj >= BTRFS_ROOT_TREE_OBJECTID && \ obj <=
BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-" #define FLUSH_ACTIONS \ EM( BTRFS_RESERVE_NO_FLUSH, "BTRFS_RESERVE_NO_FLUSH") \ EM( BTRFS_RESERVE_FLUSH_LIMIT, "BTRFS_RESERVE_FLUSH_LIMIT") \ EM( BTRFS_RESERVE_FLUSH_ALL, "BTRFS_RESERVE_FLUSH_ALL") \ EMe(BTRFS_RESERVE_FLUSH_ALL_STEAL, "BTRFS_RESERVE_FLUSH_ALL_STEAL") #define FI_TYPES \ EM( BTRFS_FILE_EXTENT_INLINE, "INLINE") \ EM( BTRFS_FILE_EXTENT_REG, "REG") \ EMe(BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC") #define QGROUP_RSV_TYPES \ EM( BTRFS_QGROUP_RSV_DATA, "DATA") \ EM( BTRFS_QGROUP_RSV_META_PERTRANS, "META_PERTRANS") \ EMe(BTRFS_QGROUP_RSV_META_PREALLOC, "META_PREALLOC") #define IO_TREE_OWNER \ EM( IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS") \ EM( IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS") \ EM( IO_TREE_BTREE_INODE_IO, "BTREE_INODE_IO") \ EM( IO_TREE_INODE_IO, "INODE_IO") \ EM( IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS") \ EM( IO_TREE_TRANS_DIRTY_PAGES, "TRANS_DIRTY_PAGES") \ EM( IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES") \ EM( IO_TREE_INODE_FILE_EXTENT, "INODE_FILE_EXTENT") \ EM( IO_TREE_LOG_CSUM_RANGE, "LOG_CSUM_RANGE") \ EMe(IO_TREE_SELFTEST, "SELFTEST") #define FLUSH_STATES \ EM( FLUSH_DELAYED_ITEMS_NR, "FLUSH_DELAYED_ITEMS_NR") \ EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \ EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \ EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \ EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \ EM( FLUSH_DELAYED_REFS, "FLUSH_DELAYED_REFS") \ EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ EM( COMMIT_TRANS, "COMMIT_TRANS") \ EMe(RESET_ZONES, "RESET_ZONES") /* * First define the enums in the above macros to be exported to userspace via * TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); FLUSH_ACTIONS FI_TYPES QGROUP_RSV_TYPES IO_TREE_OWNER FLUSH_STATES /* * Now redefine the EM and EMe macros to map the enums to the strings that will * be printed in the output */ #undef EM #undef EMe #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}, \ { BTRFS_BLOCK_GROUP_RAID5, "RAID5"}, \ { BTRFS_BLOCK_GROUP_RAID6, "RAID6"} #define EXTENT_FLAGS \ { EXTENT_DIRTY, "DIRTY"}, \ { EXTENT_LOCKED, "LOCKED"}, \ { EXTENT_DIRTY_LOG1, "DIRTY_LOG1"}, \ { EXTENT_DIRTY_LOG2, "DIRTY_LOG2"}, \ { EXTENT_DELALLOC, "DELALLOC"}, \ { EXTENT_DEFRAG, "DEFRAG"}, \ { EXTENT_BOUNDARY, "BOUNDARY"}, \ { EXTENT_NODATASUM, "NODATASUM"}, \ { EXTENT_CLEAR_META_RESV, "CLEAR_META_RESV"}, \ { EXTENT_NEED_WAIT, "NEED_WAIT"}, \ { EXTENT_NORESERVE, "NORESERVE"}, \ { EXTENT_QGROUP_RESERVED, "QGROUP_RESERVED"}, \ { EXTENT_CLEAR_DATA_RESV, "CLEAR_DATA_RESV"}, \ { EXTENT_DELALLOC_NEW, "DELALLOC_NEW"} #define BTRFS_FSID_SIZE 16 #define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_FSID_SIZE) #define TP_fast_assign_fsid(fs_info) \ ({ \ if (fs_info) \ memcpy(__entry->fsid, fs_info->fs_devices->fsid, \ BTRFS_FSID_SIZE); \ else \ memset(__entry->fsid, 0, BTRFS_FSID_SIZE); \ }) #define TP_STRUCT__entry_btrfs(args...) \ TP_STRUCT__entry( \ TP_STRUCT__entry_fsid \ args) #define TP_fast_assign_btrfs(fs_info, args...) 
\ TP_fast_assign( \ TP_fast_assign_fsid(fs_info); \ args) #define TP_printk_btrfs(fmt, args...) \ TP_printk("%pU: " fmt, __entry->fsid, args) TRACE_EVENT(btrfs_transaction_commit, TP_PROTO(const struct btrfs_fs_info *fs_info), TP_ARGS(fs_info), TP_STRUCT__entry_btrfs( __field( u64, generation ) __field( u64, root_objectid ) ), TP_fast_assign_btrfs(fs_info, __entry->generation = fs_info->generation; __entry->root_objectid = BTRFS_ROOT_TREE_OBJECTID; ), TP_printk_btrfs("root=%llu(%s) gen=%llu", show_root_type(__entry->root_objectid), __entry->generation) ); DECLARE_EVENT_CLASS(btrfs__inode, TP_PROTO(const struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( u64, blocks ) __field( u64, disk_i_size ) __field( u64, generation ) __field( u64, last_trans ) __field( u64, logged_trans ) __field( u64, root_objectid ) ), TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->blocks = inode->i_blocks; __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; __entry->generation = BTRFS_I(inode)->generation; __entry->last_trans = BTRFS_I(inode)->last_trans; __entry->logged_trans = BTRFS_I(inode)->logged_trans; __entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); ), TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%llu blocks=%llu " "disk_i_size=%llu last_trans=%llu logged_trans=%llu", show_root_type(__entry->root_objectid), __entry->generation, __entry->ino, __entry->blocks, __entry->disk_i_size, __entry->last_trans, __entry->logged_trans) ); DEFINE_EVENT(btrfs__inode, btrfs_inode_new, TP_PROTO(const struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(btrfs__inode, btrfs_inode_request, TP_PROTO(const struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, TP_PROTO(const struct inode *inode), TP_ARGS(inode) ); #define __show_map_type(type) \ __print_symbolic_u64(type, \ { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ { EXTENT_MAP_HOLE, "HOLE" }, \ { EXTENT_MAP_INLINE, "INLINE" }) #define show_map_type(type) \ type, (type >= EXTENT_MAP_LAST_BYTE) ? 
"-" : __show_map_type(type) #define show_map_flags(flag) \ __print_flags(flag, "|", \ { EXTENT_FLAG_PINNED, "PINNED" },\ { EXTENT_FLAG_COMPRESS_ZLIB, "COMPRESS_ZLIB" },\ { EXTENT_FLAG_COMPRESS_LZO, "COMPRESS_LZO" },\ { EXTENT_FLAG_COMPRESS_ZSTD, "COMPRESS_ZSTD" },\ { EXTENT_FLAG_PREALLOC, "PREALLOC" },\ { EXTENT_FLAG_LOGGING, "LOGGING" }) TRACE_EVENT_CONDITION(btrfs_get_extent, TP_PROTO(const struct btrfs_root *root, const struct btrfs_inode *inode, const struct extent_map *map), TP_ARGS(root, inode, map), TP_CONDITION(map), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, ino ) __field( u64, start ) __field( u64, len ) __field( u32, flags ) __field( int, refs ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = btrfs_root_id(root); __entry->ino = btrfs_ino(inode); __entry->start = map->start; __entry->len = map->len; __entry->flags = map->flags; __entry->refs = refcount_read(&map->refs); ), TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu flags=%s refs=%u", show_root_type(__entry->root_objectid), __entry->ino, __entry->start, __entry->len, show_map_flags(__entry->flags), __entry->refs) ); TRACE_EVENT(btrfs_handle_em_exist, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct extent_map *existing, const struct extent_map *map, u64 start, u64 len), TP_ARGS(fs_info, existing, map, start, len), TP_STRUCT__entry_btrfs( __field( u64, e_start ) __field( u64, e_len ) __field( u64, map_start ) __field( u64, map_len ) __field( u64, start ) __field( u64, len ) ), TP_fast_assign_btrfs(fs_info, __entry->e_start = existing->start; __entry->e_len = existing->len; __entry->map_start = map->start; __entry->map_len = map->len; __entry->start = start; __entry->len = len; ), TP_printk_btrfs("start=%llu len=%llu " "existing(start=%llu len=%llu) " "em(start=%llu len=%llu)", __entry->start, __entry->len, __entry->e_start, __entry->e_len, __entry->map_start, __entry->map_len) ); /* file extent item */ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, const struct btrfs_file_extent_item *fi, u64 start), TP_ARGS(bi, l, fi, start), TP_STRUCT__entry_btrfs( __field( u64, root_obj ) __field( u64, ino ) __field( loff_t, isize ) __field( u64, disk_isize ) __field( u64, num_bytes ) __field( u64, ram_bytes ) __field( u64, disk_bytenr ) __field( u64, disk_num_bytes ) __field( u64, extent_offset ) __field( u8, extent_type ) __field( u8, compression ) __field( u64, extent_start ) __field( u64, extent_end ) ), TP_fast_assign_btrfs(bi->root->fs_info, __entry->root_obj = btrfs_root_id(bi->root); __entry->ino = btrfs_ino(bi); __entry->isize = bi->vfs_inode.i_size; __entry->disk_isize = bi->disk_i_size; __entry->num_bytes = btrfs_file_extent_num_bytes(l, fi); __entry->ram_bytes = btrfs_file_extent_ram_bytes(l, fi); __entry->disk_bytenr = btrfs_file_extent_disk_bytenr(l, fi); __entry->disk_num_bytes = btrfs_file_extent_disk_num_bytes(l, fi); __entry->extent_offset = btrfs_file_extent_offset(l, fi); __entry->extent_type = btrfs_file_extent_type(l, fi); __entry->compression = btrfs_file_extent_compression(l, fi); __entry->extent_start = start; __entry->extent_end = (start + __entry->num_bytes); ), TP_printk_btrfs( "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu " "file extent range=[%llu %llu] " "(num_bytes=%llu ram_bytes=%llu disk_bytenr=%llu " "disk_num_bytes=%llu extent_offset=%llu type=%s " "compression=%u", show_root_type(__entry->root_obj), __entry->ino, __entry->isize, __entry->disk_isize, 
__entry->extent_start, __entry->extent_end, __entry->num_bytes, __entry->ram_bytes, __entry->disk_bytenr, __entry->disk_num_bytes, __entry->extent_offset, __print_symbolic(__entry->extent_type, FI_TYPES), __entry->compression) ); DECLARE_EVENT_CLASS( btrfs__file_extent_item_inline, TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, const struct btrfs_file_extent_item *fi, int slot, u64 start), TP_ARGS(bi, l, fi, slot, start), TP_STRUCT__entry_btrfs( __field( u64, root_obj ) __field( u64, ino ) __field( loff_t, isize ) __field( u64, disk_isize ) __field( u8, extent_type ) __field( u8, compression ) __field( u64, extent_start ) __field( u64, extent_end ) ), TP_fast_assign_btrfs( bi->root->fs_info, __entry->root_obj = btrfs_root_id(bi->root); __entry->ino = btrfs_ino(bi); __entry->isize = bi->vfs_inode.i_size; __entry->disk_isize = bi->disk_i_size; __entry->extent_type = btrfs_file_extent_type(l, fi); __entry->compression = btrfs_file_extent_compression(l, fi); __entry->extent_start = start; __entry->extent_end = (start + btrfs_file_extent_ram_bytes(l, fi)); ), TP_printk_btrfs( "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu " "file extent range=[%llu %llu] " "extent_type=%s compression=%u", show_root_type(__entry->root_obj), __entry->ino, __entry->isize, __entry->disk_isize, __entry->extent_start, __entry->extent_end, __print_symbolic(__entry->extent_type, FI_TYPES), __entry->compression) ); DEFINE_EVENT( btrfs__file_extent_item_regular, btrfs_get_extent_show_fi_regular, TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, const struct btrfs_file_extent_item *fi, u64 start), TP_ARGS(bi, l, fi, start) ); DEFINE_EVENT( btrfs__file_extent_item_regular, btrfs_truncate_show_fi_regular, TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, const struct btrfs_file_extent_item *fi, u64 start), TP_ARGS(bi, l, fi, start) ); DEFINE_EVENT( btrfs__file_extent_item_inline, btrfs_get_extent_show_fi_inline, TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, const struct btrfs_file_extent_item *fi, int slot, u64 start), TP_ARGS(bi, l, fi, slot, start) ); DEFINE_EVENT( btrfs__file_extent_item_inline, btrfs_truncate_show_fi_inline, TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, const struct btrfs_file_extent_item *fi, int slot, u64 start), TP_ARGS(bi, l, fi, slot, start) ); #define show_ordered_flags(flags) \ __print_flags(flags, "|", \ { (1 << BTRFS_ORDERED_REGULAR), "REGULAR" }, \ { (1 << BTRFS_ORDERED_NOCOW), "NOCOW" }, \ { (1 << BTRFS_ORDERED_PREALLOC), "PREALLOC" }, \ { (1 << BTRFS_ORDERED_COMPRESSED), "COMPRESSED" }, \ { (1 << BTRFS_ORDERED_DIRECT), "DIRECT" }, \ { (1 << BTRFS_ORDERED_IO_DONE), "IO_DONE" }, \ { (1 << BTRFS_ORDERED_COMPLETE), "COMPLETE" }, \ { (1 << BTRFS_ORDERED_IOERR), "IOERR" }, \ { (1 << BTRFS_ORDERED_TRUNCATED), "TRUNCATED" }) DECLARE_EVENT_CLASS(btrfs__ordered_extent, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( u64, file_offset ) __field( u64, start ) __field( u64, len ) __field( u64, disk_len ) __field( u64, bytes_left ) __field( unsigned long, flags ) __field( int, compress_type ) __field( int, refs ) __field( u64, root_objectid ) __field( u64, truncated_len ) ), TP_fast_assign_btrfs(inode->root->fs_info, __entry->ino = btrfs_ino(inode); __entry->file_offset = ordered->file_offset; __entry->start = ordered->disk_bytenr; __entry->len = ordered->num_bytes; 
__entry->disk_len = ordered->disk_num_bytes; __entry->bytes_left = ordered->bytes_left; __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; __entry->refs = refcount_read(&ordered->refs); __entry->root_objectid = btrfs_root_id(inode->root); __entry->truncated_len = ordered->truncated_len; ), TP_printk_btrfs("root=%llu(%s) ino=%llu file_offset=%llu " "start=%llu len=%llu disk_len=%llu " "truncated_len=%llu " "bytes_left=%llu flags=%s compress_type=%d " "refs=%d", show_root_type(__entry->root_objectid), __entry->ino, __entry->file_offset, __entry->start, __entry->len, __entry->disk_len, __entry->truncated_len, __entry->bytes_left, show_ordered_flags(__entry->flags), __entry->compress_type, __entry->refs) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_range, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first_range, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_for_logging, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_split, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_dec_test_pending, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished, TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); TRACE_EVENT(btrfs_finish_ordered_extent, TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 len, bool uptodate), TP_ARGS(inode, start, len, uptodate), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( u64, start ) __field( u64, len ) __field( bool, uptodate ) __field( u64, root_objectid ) ), TP_fast_assign_btrfs(inode->root->fs_info, __entry->ino = btrfs_ino(inode); __entry->start = start; __entry->len = len; __entry->uptodate = uptodate; __entry->root_objectid = btrfs_root_id(inode->root); ), TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu uptodate=%d", 
show_root_type(__entry->root_objectid), __entry->ino, __entry->start, __entry->len, !!__entry->uptodate) ); DECLARE_EVENT_CLASS(btrfs__writepage, TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), TP_ARGS(folio, inode, wbc), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( pgoff_t, index ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) __field( char, for_kupdate ) __field( char, range_cyclic ) __field( unsigned long, writeback_index ) __field( u64, root_objectid ) ), TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->index = folio->index; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); ), TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu " "nr_to_write=%ld pages_skipped=%ld range_start=%llu " "range_end=%llu for_kupdate=%d " "range_cyclic=%d writeback_index=%lu", show_root_type(__entry->root_objectid), __entry->ino, __entry->index, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->for_kupdate, __entry->range_cyclic, __entry->writeback_index) ); DEFINE_EVENT(btrfs__writepage, extent_writepage, TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), TP_ARGS(folio, inode, wbc) ); TRACE_EVENT(btrfs_writepage_end_io_hook, TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 end, int uptodate), TP_ARGS(inode, start, end, uptodate), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( u64, start ) __field( u64, end ) __field( int, uptodate ) __field( u64, root_objectid ) ), TP_fast_assign_btrfs(inode->root->fs_info, __entry->ino = btrfs_ino(inode); __entry->start = start; __entry->end = end; __entry->uptodate = uptodate; __entry->root_objectid = btrfs_root_id(inode->root); ), TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu end=%llu uptodate=%d", show_root_type(__entry->root_objectid), __entry->ino, __entry->start, __entry->end, __entry->uptodate) ); TRACE_EVENT(btrfs_sync_file, TP_PROTO(const struct file *file, int datasync), TP_ARGS(file, datasync), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( u64, parent ) __field( int, datasync ) __field( u64, root_objectid ) ), TP_fast_assign( const struct dentry *dentry = file->f_path.dentry; const struct inode *inode = d_inode(dentry); TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb)); __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->parent = btrfs_ino(BTRFS_I(d_inode(dentry->d_parent))); __entry->datasync = datasync; __entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); ), TP_printk_btrfs("root=%llu(%s) ino=%llu parent=%llu datasync=%d", show_root_type(__entry->root_objectid), __entry->ino, __entry->parent, __entry->datasync) ); TRACE_EVENT(btrfs_sync_fs, TP_PROTO(const struct btrfs_fs_info *fs_info, int wait), TP_ARGS(fs_info, wait), TP_STRUCT__entry_btrfs( __field( int, wait ) ), TP_fast_assign_btrfs(fs_info, __entry->wait = wait; ), TP_printk_btrfs("wait=%d", __entry->wait) ); TRACE_EVENT(btrfs_add_block_group, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_block_group *block_group, int create), TP_ARGS(fs_info, 
block_group, create), TP_STRUCT__entry_btrfs( __field( u64, offset ) __field( u64, size ) __field( u64, flags ) __field( u64, bytes_used ) __field( u64, bytes_super ) __field( int, create ) ), TP_fast_assign_btrfs(fs_info, __entry->offset = block_group->start; __entry->size = block_group->length; __entry->flags = block_group->flags; __entry->bytes_used = block_group->used; __entry->bytes_super = block_group->bytes_super; __entry->create = create; ), TP_printk_btrfs("block_group offset=%llu size=%llu " "flags=%llu(%s) bytes_used=%llu bytes_super=%llu " "create=%d", __entry->offset, __entry->size, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->bytes_used, __entry->bytes_super, __entry->create) ); #define show_ref_action(action) \ __print_symbolic(action, \ { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \ { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \ { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \ { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" }) DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_node *ref), TP_ARGS(fs_info, ref), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( int, action ) __field( u64, parent ) __field( u64, ref_root ) __field( int, level ) __field( int, type ) __field( u64, seq ) ), TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; __entry->action = ref->action; __entry->parent = ref->parent; __entry->ref_root = ref->ref_root; __entry->level = ref->tree_ref.level; __entry->type = ref->type; __entry->seq = ref->seq; ), TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s " "parent=%llu(%s) ref_root=%llu(%s) level=%d " "type=%s seq=%llu", __entry->bytenr, __entry->num_bytes, show_ref_action(__entry->action), show_root_type(__entry->parent), show_root_type(__entry->ref_root), __entry->level, show_ref_type(__entry->type), __entry->seq) ); DEFINE_EVENT(btrfs_delayed_tree_ref, add_delayed_tree_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_node *ref), TP_ARGS(fs_info, ref) ); DEFINE_EVENT(btrfs_delayed_tree_ref, run_delayed_tree_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_node *ref), TP_ARGS(fs_info, ref) ); DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_node *ref), TP_ARGS(fs_info, ref), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( int, action ) __field( u64, parent ) __field( u64, ref_root ) __field( u64, owner ) __field( u64, offset ) __field( int, type ) __field( u64, seq ) ), TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; __entry->action = ref->action; __entry->parent = ref->parent; __entry->ref_root = ref->ref_root; __entry->owner = ref->data_ref.objectid; __entry->offset = ref->data_ref.offset; __entry->type = ref->type; __entry->seq = ref->seq; ), TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s " "parent=%llu(%s) ref_root=%llu(%s) owner=%llu " "offset=%llu type=%s seq=%llu", __entry->bytenr, __entry->num_bytes, show_ref_action(__entry->action), show_root_type(__entry->parent), show_root_type(__entry->ref_root), __entry->owner, __entry->offset, show_ref_type(__entry->type), __entry->seq) ); DEFINE_EVENT(btrfs_delayed_data_ref, add_delayed_data_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct 
btrfs_delayed_ref_node *ref), TP_ARGS(fs_info, ref) ); DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_node *ref), TP_ARGS(fs_info, ref) ); DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_head *head_ref, int action), TP_ARGS(fs_info, head_ref, action), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( int, action ) __field( int, is_data ) ), TP_fast_assign_btrfs(fs_info, __entry->bytenr = head_ref->bytenr; __entry->num_bytes = head_ref->num_bytes; __entry->action = action; __entry->is_data = head_ref->is_data; ), TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s is_data=%d", __entry->bytenr, __entry->num_bytes, show_ref_action(__entry->action), __entry->is_data) ); DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_head *head_ref, int action), TP_ARGS(fs_info, head_ref, action) ); DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_delayed_ref_head *head_ref, int action), TP_ARGS(fs_info, head_ref, action) ); #define show_chunk_type(type) \ __print_flags(type, "|", \ { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \ { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \ { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \ { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \ { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}, \ { BTRFS_BLOCK_GROUP_RAID5, "RAID5" }, \ { BTRFS_BLOCK_GROUP_RAID6, "RAID6" }) DECLARE_EVENT_CLASS(btrfs__chunk, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_chunk_map *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size), TP_STRUCT__entry_btrfs( __field( int, num_stripes ) __field( u64, type ) __field( int, sub_stripes ) __field( u64, offset ) __field( u64, size ) __field( u64, root_objectid ) ), TP_fast_assign_btrfs(fs_info, __entry->num_stripes = map->num_stripes; __entry->type = map->type; __entry->sub_stripes = map->sub_stripes; __entry->offset = offset; __entry->size = size; __entry->root_objectid = btrfs_root_id(fs_info->chunk_root); ), TP_printk_btrfs("root=%llu(%s) offset=%llu size=%llu " "num_stripes=%d sub_stripes=%d type=%s", show_root_type(__entry->root_objectid), __entry->offset, __entry->size, __entry->num_stripes, __entry->sub_stripes, show_chunk_type(__entry->type)) ); DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_chunk_map *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size) ); DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_chunk_map *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size) ); TRACE_EVENT(btrfs_cow_block, TP_PROTO(const struct btrfs_root *root, const struct extent_buffer *buf, const struct extent_buffer *cow), TP_ARGS(root, buf, cow), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, buf_start ) __field( int, refs ) __field( u64, cow_start ) __field( int, buf_level ) __field( int, cow_level ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = btrfs_root_id(root); __entry->buf_start = buf->start; __entry->refs = refcount_read(&buf->refs); __entry->cow_start = cow->start; __entry->buf_level = btrfs_header_level(buf); __entry->cow_level = btrfs_header_level(cow); ), 
TP_printk_btrfs("root=%llu(%s) refs=%d orig_buf=%llu " "(orig_level=%d) cow_buf=%llu (cow_level=%d)", show_root_type(__entry->root_objectid), __entry->refs, __entry->buf_start, __entry->buf_level, __entry->cow_start, __entry->cow_level) ); TRACE_EVENT(btrfs_space_reservation, TP_PROTO(const struct btrfs_fs_info *fs_info, const char *type, u64 val, u64 bytes, int reserve), TP_ARGS(fs_info, type, val, bytes, reserve), TP_STRUCT__entry_btrfs( __string( type, type ) __field( u64, val ) __field( u64, bytes ) __field( int, reserve ) ), TP_fast_assign_btrfs(fs_info, __assign_str(type); __entry->val = val; __entry->bytes = bytes; __entry->reserve = reserve; ), TP_printk_btrfs("%s: %llu %s %llu", __get_str(type), __entry->val, __entry->reserve ? "reserve" : "release", __entry->bytes) ); TRACE_EVENT(btrfs_trigger_flush, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, int flush, const char *reason), TP_ARGS(fs_info, flags, bytes, flush, reason), TP_STRUCT__entry_btrfs( __field( u64, flags ) __field( u64, bytes ) __field( int, flush ) __string( reason, reason ) ), TP_fast_assign_btrfs(fs_info, __entry->flags = flags; __entry->bytes = bytes; __entry->flush = flush; __assign_str(reason); ), TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu", __get_str(reason), __entry->flush, __print_symbolic(__entry->flush, FLUSH_ACTIONS), __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->bytes) ); TRACE_EVENT(btrfs_flush_space, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes, int state, int ret, bool for_preempt), TP_ARGS(fs_info, flags, num_bytes, state, ret, for_preempt), TP_STRUCT__entry_btrfs( __field( u64, flags ) __field( u64, num_bytes ) __field( int, state ) __field( int, ret ) __field( bool, for_preempt ) ), TP_fast_assign_btrfs(fs_info, __entry->flags = flags; __entry->num_bytes = num_bytes; __entry->state = state; __entry->ret = ret; __entry->for_preempt = for_preempt; ), TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d for_preempt=%d", __entry->state, __print_symbolic(__entry->state, FLUSH_STATES), __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->num_bytes, __entry->ret, __entry->for_preempt) ); DECLARE_EVENT_CLASS(btrfs__reserved_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len), TP_ARGS(fs_info, start, len), TP_STRUCT__entry_btrfs( __field( u64, start ) __field( u64, len ) ), TP_fast_assign_btrfs(fs_info, __entry->start = start; __entry->len = len; ), TP_printk_btrfs("root=%llu(%s) start=%llu len=%llu", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->start, __entry->len) ); DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len), TP_ARGS(fs_info, start, len) ); DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len), TP_ARGS(fs_info, start, len) ); TRACE_EVENT(btrfs_find_free_extent, TP_PROTO(const struct btrfs_root *root, const struct find_free_extent_ctl *ffe_ctl), TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = btrfs_root_id(root); __entry->num_bytes = ffe_ctl->num_bytes; __entry->empty_size = ffe_ctl->empty_size; __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu 
empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS)) ); TRACE_EVENT(btrfs_find_free_extent_search_loop, TP_PROTO(const struct btrfs_root *root, const struct find_free_extent_ctl *ffe_ctl), TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, flags ) __field( u64, loop ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = btrfs_root_id(root); __entry->num_bytes = ffe_ctl->num_bytes; __entry->empty_size = ffe_ctl->empty_size; __entry->flags = ffe_ctl->flags; __entry->loop = ffe_ctl->loop; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->loop) ); TRACE_EVENT(btrfs_find_free_extent_have_block_group, TP_PROTO(const struct btrfs_root *root, const struct find_free_extent_ctl *ffe_ctl, const struct btrfs_block_group *block_group), TP_ARGS(root, ffe_ctl, block_group), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, flags ) __field( u64, loop ) __field( bool, hinted ) __field( u64, bg_start ) __field( u64, bg_flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = btrfs_root_id(root); __entry->num_bytes = ffe_ctl->num_bytes; __entry->empty_size = ffe_ctl->empty_size; __entry->flags = ffe_ctl->flags; __entry->loop = ffe_ctl->loop; __entry->hinted = ffe_ctl->hinted; __entry->bg_start = block_group->start; __entry->bg_flags = block_group->flags; ), TP_printk_btrfs( "root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->loop, __entry->hinted, __entry->bg_start, __entry->bg_flags, __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, TP_PROTO(const struct btrfs_block_group *block_group, const struct find_free_extent_ctl *ffe_ctl), TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) __field( u64, loop ) __field( bool, hinted ) __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; __entry->bg_size_class = block_group->size_class; __entry->start = ffe_ctl->search_start; __entry->len = ffe_ctl->num_bytes; __entry->loop = ffe_ctl->loop; __entry->hinted = ffe_ctl->hinted; __entry->size_class = ffe_ctl->size_class; ), TP_printk_btrfs( "root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->bg_size_class, __entry->start, __entry->len, __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, TP_PROTO(const struct btrfs_block_group *block_group, const struct find_free_extent_ctl 
*ffe_ctl), TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, TP_PROTO(const struct btrfs_block_group *block_group, const struct find_free_extent_ctl *ffe_ctl), TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster, TP_PROTO(const struct btrfs_block_group *block_group, u64 start, u64 bytes, u64 empty_size, u64 min_bytes), TP_ARGS(block_group, start, bytes, empty_size, min_bytes), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) __field( u64, start ) __field( u64, bytes ) __field( u64, empty_size ) __field( u64, min_bytes ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; __entry->start = start; __entry->bytes = bytes; __entry->empty_size = empty_size; __entry->min_bytes = min_bytes; ), TP_printk_btrfs("block_group=%llu flags=%llu(%s) start=%llu len=%llu " "empty_size=%llu min_bytes=%llu", __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->start, __entry->bytes, __entry->empty_size, __entry->min_bytes) ); TRACE_EVENT(btrfs_failed_cluster_setup, TP_PROTO(const struct btrfs_block_group *block_group), TP_ARGS(block_group), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; ), TP_printk_btrfs("block_group=%llu", __entry->bg_objectid) ); TRACE_EVENT(btrfs_setup_cluster, TP_PROTO(const struct btrfs_block_group *block_group, const struct btrfs_free_cluster *cluster, u64 size, int bitmap), TP_ARGS(block_group, cluster, size, bitmap), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) __field( u64, start ) __field( u64, max_size ) __field( u64, size ) __field( int, bitmap ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; __entry->start = cluster->window_start; __entry->max_size = cluster->max_size; __entry->size = size; __entry->bitmap = bitmap; ), TP_printk_btrfs("block_group=%llu flags=%llu(%s) window_start=%llu " "size=%llu max_size=%llu bitmap=%d", __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->start, __entry->size, __entry->max_size, __entry->bitmap) ); struct extent_state; TRACE_EVENT(btrfs_alloc_extent_state, TP_PROTO(const struct extent_state *state, gfp_t mask, unsigned long IP), TP_ARGS(state, mask, IP), TP_STRUCT__entry( __field(const struct extent_state *, state) __field(unsigned long, mask) __field(const void*, ip) ), TP_fast_assign( __entry->state = state, __entry->mask = (__force unsigned long)mask, __entry->ip = (const void *)IP ), TP_printk("state=%p mask=%s caller=%pS", __entry->state, show_gfp_flags(__entry->mask), __entry->ip) ); TRACE_EVENT(btrfs_free_extent_state, TP_PROTO(const struct extent_state *state, unsigned long IP), TP_ARGS(state, IP), TP_STRUCT__entry( __field(const struct extent_state *, state) __field(const void*, ip) ), TP_fast_assign( __entry->state = state, __entry->ip = (const void *)IP ), TP_printk("state=%p caller=%pS", __entry->state, __entry->ip) ); DECLARE_EVENT_CLASS(btrfs__work, TP_PROTO(const struct btrfs_work *work), TP_ARGS(work), TP_STRUCT__entry_btrfs( __field( const void *, work ) __field( const void *, wq ) __field( const void *, func ) __field( const void *, ordered_func ) __field( const void *, normal_work ) ), TP_fast_assign_btrfs(btrfs_work_owner(work), __entry->work 
= work; __entry->wq = work->wq; __entry->func = work->func; __entry->ordered_func = work->ordered_func; __entry->normal_work = &work->normal_work; ), TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%ps ordered_func=%p", __entry->work, __entry->normal_work, __entry->wq, __entry->func, __entry->ordered_func) ); /* * For situations when the work is freed, we pass fs_info and a tag that matches * the address of the work structure so it can be paired with the scheduling * event. DO NOT add anything here that dereferences wtag. */ DECLARE_EVENT_CLASS(btrfs__work__done, TP_PROTO(const struct btrfs_fs_info *fs_info, const void *wtag), TP_ARGS(fs_info, wtag), TP_STRUCT__entry_btrfs( __field( const void *, wtag ) ), TP_fast_assign_btrfs(fs_info, __entry->wtag = wtag; ), TP_printk_btrfs("work->%p", __entry->wtag) ); DEFINE_EVENT(btrfs__work, btrfs_work_queued, TP_PROTO(const struct btrfs_work *work), TP_ARGS(work) ); DEFINE_EVENT(btrfs__work, btrfs_work_sched, TP_PROTO(const struct btrfs_work *work), TP_ARGS(work) ); DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done, TP_PROTO(const struct btrfs_fs_info *fs_info, const void *wtag), TP_ARGS(fs_info, wtag) ); DEFINE_EVENT(btrfs__work, btrfs_ordered_sched, TP_PROTO(const struct btrfs_work *work), TP_ARGS(work) ); DECLARE_EVENT_CLASS(btrfs_workqueue, TP_PROTO(const struct btrfs_workqueue *wq, const char *name), TP_ARGS(wq, name), TP_STRUCT__entry_btrfs( __field( const void *, wq ) __string( name, name ) ), TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), __entry->wq = wq; __assign_str(name); ), TP_printk_btrfs("name=%s wq=%p", __get_str(name), __entry->wq) ); DEFINE_EVENT(btrfs_workqueue, btrfs_workqueue_alloc, TP_PROTO(const struct btrfs_workqueue *wq, const char *name), TP_ARGS(wq, name) ); DECLARE_EVENT_CLASS(btrfs_workqueue_done, TP_PROTO(const struct btrfs_workqueue *wq), TP_ARGS(wq), TP_STRUCT__entry_btrfs( __field( const void *, wq ) ), TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), __entry->wq = wq; ), TP_printk_btrfs("wq=%p", __entry->wq) ); DEFINE_EVENT(btrfs_workqueue_done, btrfs_workqueue_destroy, TP_PROTO(const struct btrfs_workqueue *wq), TP_ARGS(wq) ); #define BTRFS_QGROUP_OPERATIONS \ { QGROUP_RESERVE, "reserve" }, \ { QGROUP_RELEASE, "release" }, \ { QGROUP_FREE, "free" } DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, TP_PROTO(const struct inode *inode, u64 start, u64 len, u64 reserved, int op), TP_ARGS(inode, start, len, reserved, op), TP_STRUCT__entry_btrfs( __field( u64, rootid ) __field( u64, ino ) __field( u64, start ) __field( u64, len ) __field( u64, reserved ) __field( int, op ) ), TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->rootid = btrfs_root_id(BTRFS_I(inode)->root); __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->start = start; __entry->len = len; __entry->reserved = reserved; __entry->op = op; ), TP_printk_btrfs("root=%llu ino=%llu start=%llu len=%llu reserved=%llu op=%s", __entry->rootid, __entry->ino, __entry->start, __entry->len, __entry->reserved, __print_flags((unsigned long)__entry->op, "", BTRFS_QGROUP_OPERATIONS) ) ); DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data, TP_PROTO(const struct inode *inode, u64 start, u64 len, u64 reserved, int op), TP_ARGS(inode, start, len, reserved, op) ); DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, TP_PROTO(const struct inode *inode, u64 start, u64 len, u64 reserved, int op), TP_ARGS(inode, start, len, reserved, op) ); DECLARE_EVENT_CLASS(btrfs_qgroup_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct 
btrfs_qgroup_extent_record *rec, u64 bytenr), TP_ARGS(fs_info, rec, bytenr), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) ), TP_fast_assign_btrfs(fs_info, __entry->bytenr = bytenr; __entry->num_bytes = rec->num_bytes; ), TP_printk_btrfs("bytenr=%llu num_bytes=%llu", __entry->bytenr, __entry->num_bytes) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup_extent_record *rec, u64 bytenr), TP_ARGS(fs_info, rec, bytenr) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup_extent_record *rec, u64 bytenr), TP_ARGS(fs_info, rec, bytenr) ); TRACE_EVENT(btrfs_qgroup_num_dirty_extents, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 num_dirty_extents), TP_ARGS(fs_info, transid, num_dirty_extents), TP_STRUCT__entry_btrfs( __field( u64, transid ) __field( u64, num_dirty_extents ) ), TP_fast_assign_btrfs(fs_info, __entry->transid = transid; __entry->num_dirty_extents = num_dirty_extents; ), TP_printk_btrfs("transid=%llu num_dirty_extents=%llu", __entry->transid, __entry->num_dirty_extents) ); TRACE_EVENT(btrfs_qgroup_account_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), TP_ARGS(fs_info, transid, bytenr, num_bytes, nr_old_roots, nr_new_roots), TP_STRUCT__entry_btrfs( __field( u64, transid ) __field( u64, bytenr ) __field( u64, num_bytes ) __field( u64, nr_old_roots ) __field( u64, nr_new_roots ) ), TP_fast_assign_btrfs(fs_info, __entry->transid = transid; __entry->bytenr = bytenr; __entry->num_bytes = num_bytes; __entry->nr_old_roots = nr_old_roots; __entry->nr_new_roots = nr_new_roots; ), TP_printk_btrfs( "transid=%llu bytenr=%llu num_bytes=%llu nr_old_roots=%llu nr_new_roots=%llu", __entry->transid, __entry->bytenr, __entry->num_bytes, __entry->nr_old_roots, __entry->nr_new_roots) ); TRACE_EVENT(btrfs_qgroup_update_counters, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qgroup, u64 cur_old_count, u64 cur_new_count), TP_ARGS(fs_info, qgroup, cur_old_count, cur_new_count), TP_STRUCT__entry_btrfs( __field( u64, qgid ) __field( u64, old_rfer ) __field( u64, old_excl ) __field( u64, cur_old_count ) __field( u64, cur_new_count ) ), TP_fast_assign_btrfs(fs_info, __entry->qgid = qgroup->qgroupid; __entry->old_rfer = qgroup->rfer; __entry->old_excl = qgroup->excl; __entry->cur_old_count = cur_old_count; __entry->cur_new_count = cur_new_count; ), TP_printk_btrfs("qgid=%llu old_rfer=%llu old_excl=%llu cur_old_count=%llu cur_new_count=%llu", __entry->qgid, __entry->old_rfer, __entry->old_excl, __entry->cur_old_count, __entry->cur_new_count) ); TRACE_EVENT(btrfs_qgroup_update_reserve, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qgroup, s64 diff, int type), TP_ARGS(fs_info, qgroup, diff, type), TP_STRUCT__entry_btrfs( __field( u64, qgid ) __field( u64, cur_reserved ) __field( s64, diff ) __field( int, type ) ), TP_fast_assign_btrfs(fs_info, __entry->qgid = qgroup->qgroupid; __entry->cur_reserved = qgroup->rsv.values[type]; __entry->diff = diff; __entry->type = type; ), TP_printk_btrfs("qgid=%llu type=%s cur_reserved=%llu diff=%lld", __entry->qgid, __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->cur_reserved, __entry->diff) ); TRACE_EVENT(btrfs_qgroup_meta_reserve, TP_PROTO(const struct btrfs_root *root, s64 diff, int type), TP_ARGS(root, diff, type), 
TP_STRUCT__entry_btrfs( __field( u64, refroot ) __field( s64, diff ) __field( int, type ) ), TP_fast_assign_btrfs(root->fs_info, __entry->refroot = btrfs_root_id(root); __entry->diff = diff; __entry->type = type; ), TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld", show_root_type(__entry->refroot), __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->diff) ); TRACE_EVENT(btrfs_qgroup_meta_convert, TP_PROTO(const struct btrfs_root *root, s64 diff), TP_ARGS(root, diff), TP_STRUCT__entry_btrfs( __field( u64, refroot ) __field( s64, diff ) ), TP_fast_assign_btrfs(root->fs_info, __entry->refroot = btrfs_root_id(root); __entry->diff = diff; ), TP_printk_btrfs("refroot=%llu(%s) type=%s->%s diff=%lld", show_root_type(__entry->refroot), __print_symbolic(BTRFS_QGROUP_RSV_META_PREALLOC, QGROUP_RSV_TYPES), __print_symbolic(BTRFS_QGROUP_RSV_META_PERTRANS, QGROUP_RSV_TYPES), __entry->diff) ); TRACE_EVENT(btrfs_qgroup_meta_free_all_pertrans, TP_PROTO(struct btrfs_root *root), TP_ARGS(root), TP_STRUCT__entry_btrfs( __field( u64, refroot ) __field( s64, diff ) __field( int, type ) ), TP_fast_assign_btrfs(root->fs_info, __entry->refroot = btrfs_root_id(root); spin_lock(&root->qgroup_meta_rsv_lock); __entry->diff = -(s64)root->qgroup_meta_rsv_pertrans; spin_unlock(&root->qgroup_meta_rsv_lock); __entry->type = BTRFS_QGROUP_RSV_META_PERTRANS; ), TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld", show_root_type(__entry->refroot), __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->diff) ); DECLARE_EVENT_CLASS(btrfs__prelim_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct prelim_ref *oldref, const struct prelim_ref *newref, u64 tree_size), TP_ARGS(fs_info, oldref, newref, tree_size), TP_STRUCT__entry_btrfs( __field( u64, root_id ) __field( u64, objectid ) __field( u8, type ) __field( u64, offset ) __field( int, level ) __field( int, old_count ) __field( u64, parent ) __field( u64, bytenr ) __field( int, mod_count ) __field( u64, tree_size ) ), TP_fast_assign_btrfs(fs_info, __entry->root_id = oldref->root_id; __entry->objectid = oldref->key_for_search.objectid; __entry->type = oldref->key_for_search.type; __entry->offset = oldref->key_for_search.offset; __entry->level = oldref->level; __entry->old_count = oldref->count; __entry->parent = oldref->parent; __entry->bytenr = oldref->wanted_disk_byte; __entry->mod_count = newref ? 
newref->count : 0; __entry->tree_size = tree_size; ), TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu", __entry->root_id, __entry->objectid, __entry->type, __entry->offset, __entry->level, __entry->old_count, __entry->mod_count, __entry->old_count + __entry->mod_count, __entry->parent, __entry->bytenr, __entry->tree_size) ); DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct prelim_ref *oldref, const struct prelim_ref *newref, u64 tree_size), TP_ARGS(fs_info, oldref, newref, tree_size) ); DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct prelim_ref *oldref, const struct prelim_ref *newref, u64 tree_size), TP_ARGS(fs_info, oldref, newref, tree_size) ); TRACE_EVENT(btrfs_inode_mod_outstanding_extents, TP_PROTO(const struct btrfs_root *root, u64 ino, int mod, unsigned outstanding), TP_ARGS(root, ino, mod, outstanding), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, ino ) __field( int, mod ) __field( unsigned, outstanding ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = btrfs_root_id(root); __entry->ino = ino; __entry->mod = mod; __entry->outstanding = outstanding; ), TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d outstanding=%u", show_root_type(__entry->root_objectid), __entry->ino, __entry->mod, __entry->outstanding) ); DECLARE_EVENT_CLASS(btrfs__block_group, TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, len ) __field( u64, used ) __field( u64, flags ) ), TP_fast_assign_btrfs(bg_cache->fs_info, __entry->bytenr = bg_cache->start, __entry->len = bg_cache->length, __entry->used = bg_cache->used; __entry->flags = bg_cache->flags; ), TP_printk_btrfs("bg bytenr=%llu len=%llu used=%llu flags=%llu(%s)", __entry->bytenr, __entry->len, __entry->used, __entry->flags, __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS)) ); DEFINE_EVENT(btrfs__block_group, btrfs_remove_block_group, TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group, TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group, TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group, TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group, TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); TRACE_EVENT(btrfs_set_extent_bit, TP_PROTO(const struct extent_io_tree *tree, u64 start, u64 len, unsigned set_bits), TP_ARGS(tree, start, len, set_bits), TP_STRUCT__entry_btrfs( __field( unsigned, owner ) __field( u64, ino ) __field( u64, rootid ) __field( u64, start ) __field( u64, len ) __field( unsigned, set_bits) ), TP_fast_assign_btrfs(btrfs_extent_io_tree_to_fs_info(tree), const struct btrfs_inode *inode = btrfs_extent_io_tree_to_inode(tree); __entry->owner = tree->owner; __entry->ino = inode ? btrfs_ino(inode) : 0; __entry->rootid = inode ? 
btrfs_root_id(inode->root) : 0; __entry->start = start; __entry->len = len; __entry->set_bits = set_bits; ), TP_printk_btrfs( "io_tree=%s ino=%llu root=%llu start=%llu len=%llu set_bits=%s", __print_symbolic(__entry->owner, IO_TREE_OWNER), __entry->ino, __entry->rootid, __entry->start, __entry->len, __print_flags(__entry->set_bits, "|", EXTENT_FLAGS)) ); TRACE_EVENT(btrfs_clear_extent_bit, TP_PROTO(const struct extent_io_tree *tree, u64 start, u64 len, unsigned clear_bits), TP_ARGS(tree, start, len, clear_bits), TP_STRUCT__entry_btrfs( __field( unsigned, owner ) __field( u64, ino ) __field( u64, rootid ) __field( u64, start ) __field( u64, len ) __field( unsigned, clear_bits) ), TP_fast_assign_btrfs(btrfs_extent_io_tree_to_fs_info(tree), const struct btrfs_inode *inode = btrfs_extent_io_tree_to_inode(tree); __entry->owner = tree->owner; __entry->ino = inode ? btrfs_ino(inode) : 0; __entry->rootid = inode ? btrfs_root_id(inode->root) : 0; __entry->start = start; __entry->len = len; __entry->clear_bits = clear_bits; ), TP_printk_btrfs( "io_tree=%s ino=%llu root=%llu start=%llu len=%llu clear_bits=%s", __print_symbolic(__entry->owner, IO_TREE_OWNER), __entry->ino, __entry->rootid, __entry->start, __entry->len, __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); TRACE_EVENT(btrfs_convert_extent_bit, TP_PROTO(const struct extent_io_tree *tree, u64 start, u64 len, unsigned set_bits, unsigned clear_bits), TP_ARGS(tree, start, len, set_bits, clear_bits), TP_STRUCT__entry_btrfs( __field( unsigned, owner ) __field( u64, ino ) __field( u64, rootid ) __field( u64, start ) __field( u64, len ) __field( unsigned, set_bits) __field( unsigned, clear_bits) ), TP_fast_assign_btrfs(btrfs_extent_io_tree_to_fs_info(tree), const struct btrfs_inode *inode = btrfs_extent_io_tree_to_inode(tree); __entry->owner = tree->owner; __entry->ino = inode ? btrfs_ino(inode) : 0; __entry->rootid = inode ? 
btrfs_root_id(inode->root) : 0; __entry->start = start; __entry->len = len; __entry->set_bits = set_bits; __entry->clear_bits = clear_bits; ), TP_printk_btrfs( "io_tree=%s ino=%llu root=%llu start=%llu len=%llu set_bits=%s clear_bits=%s", __print_symbolic(__entry->owner, IO_TREE_OWNER), __entry->ino, __entry->rootid, __entry->start, __entry->len, __print_flags(__entry->set_bits , "|", EXTENT_FLAGS), __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs_dump_space_info, TP_PROTO(struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo), TP_ARGS(fs_info, sinfo), TP_STRUCT__entry_btrfs( __field( u64, flags ) __field( u64, total_bytes ) __field( u64, bytes_used ) __field( u64, bytes_pinned ) __field( u64, bytes_reserved ) __field( u64, bytes_may_use ) __field( u64, bytes_readonly ) __field( u64, reclaim_size ) __field( int, clamp ) __field( u64, global_reserved ) __field( u64, trans_reserved ) __field( u64, delayed_refs_reserved ) __field( u64, delayed_reserved ) __field( u64, free_chunk_space ) __field( u64, delalloc_bytes ) __field( u64, ordered_bytes ) ), TP_fast_assign_btrfs(fs_info, __entry->flags = sinfo->flags; __entry->total_bytes = sinfo->total_bytes; __entry->bytes_used = sinfo->bytes_used; __entry->bytes_pinned = sinfo->bytes_pinned; __entry->bytes_reserved = sinfo->bytes_reserved; __entry->bytes_may_use = sinfo->bytes_may_use; __entry->bytes_readonly = sinfo->bytes_readonly; __entry->reclaim_size = sinfo->reclaim_size; __entry->clamp = sinfo->clamp; __entry->global_reserved = fs_info->global_block_rsv.reserved; __entry->trans_reserved = fs_info->trans_block_rsv.reserved; __entry->delayed_refs_reserved = fs_info->delayed_refs_rsv.reserved; __entry->delayed_reserved = fs_info->delayed_block_rsv.reserved; __entry->free_chunk_space = atomic64_read(&fs_info->free_chunk_space); __entry->delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); __entry->ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); ), TP_printk_btrfs("flags=%s total_bytes=%llu bytes_used=%llu " "bytes_pinned=%llu bytes_reserved=%llu " "bytes_may_use=%llu bytes_readonly=%llu " "reclaim_size=%llu clamp=%d global_reserved=%llu " "trans_reserved=%llu delayed_refs_reserved=%llu " "delayed_reserved=%llu chunk_free_space=%llu " "delalloc_bytes=%llu ordered_bytes=%llu", __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->total_bytes, __entry->bytes_used, __entry->bytes_pinned, __entry->bytes_reserved, __entry->bytes_may_use, __entry->bytes_readonly, __entry->reclaim_size, __entry->clamp, __entry->global_reserved, __entry->trans_reserved, __entry->delayed_refs_reserved, __entry->delayed_reserved, __entry->free_chunk_space, __entry->delalloc_bytes, __entry->ordered_bytes) ); DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim, TP_PROTO(struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo), TP_ARGS(fs_info, sinfo) ); DEFINE_EVENT(btrfs_dump_space_info, btrfs_fail_all_tickets, TP_PROTO(struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo), TP_ARGS(fs_info, sinfo) ); TRACE_EVENT(btrfs_reserve_ticket, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, u64 start_ns, int flush, int error), TP_ARGS(fs_info, flags, bytes, start_ns, flush, error), TP_STRUCT__entry_btrfs( __field( u64, flags ) __field( u64, bytes ) __field( u64, start_ns ) __field( int, flush ) __field( int, error ) ), TP_fast_assign_btrfs(fs_info, __entry->flags = flags; __entry->bytes = bytes; __entry->start_ns = start_ns; 
__entry->flush = flush; __entry->error = error; ), TP_printk_btrfs("flags=%s bytes=%llu start_ns=%llu flush=%s error=%d", __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS), __entry->bytes, __entry->start_ns, __print_symbolic(__entry->flush, FLUSH_ACTIONS), __entry->error) ); DECLARE_EVENT_CLASS(btrfs_sleep_tree_lock, TP_PROTO(const struct extent_buffer *eb, u64 start_ns), TP_ARGS(eb, start_ns), TP_STRUCT__entry_btrfs( __field( u64, block ) __field( u64, generation ) __field( u64, start_ns ) __field( u64, end_ns ) __field( u64, diff_ns ) __field( u64, owner ) __field( int, is_log_tree ) ), TP_fast_assign_btrfs(eb->fs_info, __entry->block = eb->start; __entry->generation = btrfs_header_generation(eb); __entry->start_ns = start_ns; __entry->end_ns = ktime_get_ns(); __entry->diff_ns = __entry->end_ns - start_ns; __entry->owner = btrfs_header_owner(eb); __entry->is_log_tree = (eb->log_index >= 0); ), TP_printk_btrfs( "block=%llu generation=%llu start_ns=%llu end_ns=%llu diff_ns=%llu owner=%llu is_log_tree=%d", __entry->block, __entry->generation, __entry->start_ns, __entry->end_ns, __entry->diff_ns, __entry->owner, __entry->is_log_tree) ); DEFINE_EVENT(btrfs_sleep_tree_lock, btrfs_tree_read_lock, TP_PROTO(const struct extent_buffer *eb, u64 start_ns), TP_ARGS(eb, start_ns) ); DEFINE_EVENT(btrfs_sleep_tree_lock, btrfs_tree_lock, TP_PROTO(const struct extent_buffer *eb, u64 start_ns), TP_ARGS(eb, start_ns) ); DECLARE_EVENT_CLASS(btrfs_locking_events, TP_PROTO(const struct extent_buffer *eb), TP_ARGS(eb), TP_STRUCT__entry_btrfs( __field( u64, block ) __field( u64, generation ) __field( u64, owner ) __field( int, is_log_tree ) ), TP_fast_assign_btrfs(eb->fs_info, __entry->block = eb->start; __entry->generation = btrfs_header_generation(eb); __entry->owner = btrfs_header_owner(eb); __entry->is_log_tree = (eb->log_index >= 0); ), TP_printk_btrfs("block=%llu generation=%llu owner=%llu is_log_tree=%d", __entry->block, __entry->generation, __entry->owner, __entry->is_log_tree) ); #define DEFINE_BTRFS_LOCK_EVENT(name) \ DEFINE_EVENT(btrfs_locking_events, name, \ TP_PROTO(const struct extent_buffer *eb), \ \ TP_ARGS(eb) \ ) DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_unlock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); DECLARE_EVENT_CLASS(btrfs__space_info_update, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff), TP_STRUCT__entry_btrfs( __field( u64, type ) __field( u64, old ) __field( s64, diff ) ), TP_fast_assign_btrfs(fs_info, __entry->type = sinfo->flags; __entry->old = old; __entry->diff = diff; ), TP_printk_btrfs("type=%s old=%llu diff=%lld", __print_flags(__entry->type, "|", BTRFS_GROUP_FLAGS), __entry->old, __entry->diff) ); DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff) ); DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff) ); DEFINE_EVENT(btrfs__space_info_update, update_bytes_zone_unusable, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff) ); DECLARE_EVENT_CLASS(btrfs_raid56_bio, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct 
raid56_bio_trace_info *trace_info), TP_ARGS(rbio, bio, trace_info), TP_STRUCT__entry_btrfs( __field( u64, full_stripe ) __field( u64, physical ) __field( u64, devid ) __field( u32, offset ) __field( u32, len ) __field( u8, opf ) __field( u8, total_stripes ) __field( u8, real_stripes ) __field( u8, nr_data ) __field( u8, stripe_nr ) ), TP_fast_assign_btrfs(rbio->bioc->fs_info, __entry->full_stripe = rbio->bioc->full_stripe_logical; __entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; __entry->len = bio->bi_iter.bi_size; __entry->opf = bio_op(bio); __entry->devid = trace_info->devid; __entry->offset = trace_info->offset; __entry->stripe_nr = trace_info->stripe_nr; __entry->total_stripes = rbio->bioc->num_stripes; __entry->real_stripes = rbio->real_stripes; __entry->nr_data = rbio->nr_data; ), /* * For type output, we need to output things like "DATA1" * (the first data stripe), "DATA2" (the second data stripe), * "PQ1" (P stripe),"PQ2" (Q stripe), "REPLACE0" (replace target device). */ TP_printk_btrfs( "full_stripe=%llu devid=%lld type=%s%d offset=%d opf=0x%x physical=%llu len=%u", __entry->full_stripe, __entry->devid, (__entry->stripe_nr < __entry->nr_data) ? "DATA" : ((__entry->stripe_nr < __entry->real_stripes) ? "PQ" : "REPLACE"), (__entry->stripe_nr < __entry->nr_data) ? (__entry->stripe_nr + 1) : ((__entry->stripe_nr < __entry->real_stripes) ? (__entry->stripe_nr - __entry->nr_data + 1) : 0), __entry->offset, __entry->opf, __entry->physical, __entry->len) ); DEFINE_EVENT(btrfs_raid56_bio, raid56_read, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct raid56_bio_trace_info *trace_info), TP_ARGS(rbio, bio, trace_info) ); DEFINE_EVENT(btrfs_raid56_bio, raid56_write, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct raid56_bio_trace_info *trace_info), TP_ARGS(rbio, bio, trace_info) ); TRACE_EVENT(btrfs_insert_one_raid_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 logical, u64 length, int num_stripes), TP_ARGS(fs_info, logical, length, num_stripes), TP_STRUCT__entry_btrfs( __field( u64, logical ) __field( u64, length ) __field( int, num_stripes ) ), TP_fast_assign_btrfs(fs_info, __entry->logical = logical; __entry->length = length; __entry->num_stripes = num_stripes; ), TP_printk_btrfs("logical=%llu length=%llu num_stripes=%d", __entry->logical, __entry->length, __entry->num_stripes) ); TRACE_EVENT(btrfs_raid_extent_delete, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 end, u64 found_start, u64 found_end), TP_ARGS(fs_info, start, end, found_start, found_end), TP_STRUCT__entry_btrfs( __field( u64, start ) __field( u64, end ) __field( u64, found_start ) __field( u64, found_end ) ), TP_fast_assign_btrfs(fs_info, __entry->start = start; __entry->end = end; __entry->found_start = found_start; __entry->found_end = found_end; ), TP_printk_btrfs("start=%llu end=%llu found_start=%llu found_end=%llu", __entry->start, __entry->end, __entry->found_start, __entry->found_end) ); TRACE_EVENT(btrfs_get_raid_extent_offset, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 logical, u64 length, u64 physical, u64 devid), TP_ARGS(fs_info, logical, length, physical, devid), TP_STRUCT__entry_btrfs( __field( u64, logical ) __field( u64, length ) __field( u64, physical ) __field( u64, devid ) ), TP_fast_assign_btrfs(fs_info, __entry->logical = logical; __entry->length = length; __entry->physical = physical; __entry->devid = devid; ), TP_printk_btrfs("logical=%llu length=%llu physical=%llu devid=%llu", __entry->logical, 
__entry->length, __entry->physical, __entry->devid) ); TRACE_EVENT(btrfs_extent_map_shrinker_count, TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), TP_ARGS(fs_info, nr), TP_STRUCT__entry_btrfs( __field( long, nr ) ), TP_fast_assign_btrfs(fs_info, __entry->nr = nr; ), TP_printk_btrfs("nr=%ld", __entry->nr) ); TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), TP_ARGS(fs_info, nr), TP_STRUCT__entry_btrfs( __field( long, nr_to_scan ) __field( long, nr ) __field( u64, last_root_id ) __field( u64, last_ino ) ), TP_fast_assign_btrfs(fs_info, __entry->nr_to_scan = \ atomic64_read(&fs_info->em_shrinker_nr_to_scan); __entry->nr = nr; __entry->last_root_id = fs_info->em_shrinker_last_root; __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", __entry->nr_to_scan, __entry->nr, show_root_type(__entry->last_root_id), __entry->last_ino) ); TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), TP_ARGS(fs_info, nr_dropped, nr), TP_STRUCT__entry_btrfs( __field( long, nr_dropped ) __field( long, nr ) __field( u64, last_root_id ) __field( u64, last_ino ) ), TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; __entry->last_root_id = fs_info->em_shrinker_last_root; __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", __entry->nr_dropped, __entry->nr, show_root_type(__entry->last_root_id), __entry->last_ino) ); TRACE_EVENT(btrfs_extent_map_shrinker_remove_em, TP_PROTO(const struct btrfs_inode *inode, const struct extent_map *em), TP_ARGS(inode, em), TP_STRUCT__entry_btrfs( __field( u64, ino ) __field( u64, root_id ) __field( u64, start ) __field( u64, len ) __field( u32, flags ) ), TP_fast_assign_btrfs(inode->root->fs_info, __entry->ino = btrfs_ino(inode); __entry->root_id = btrfs_root_id(inode->root); __entry->start = em->start; __entry->len = em->len; __entry->flags = em->flags; ), TP_printk_btrfs("ino=%llu root=%llu(%s) start=%llu len=%llu flags=%s", __entry->ino, show_root_type(__entry->root_id), __entry->start, __entry->len, show_map_flags(__entry->flags)) ); #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
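/*
 * Illustrative call-site sketch, not part of the original header: each
 * TRACE_EVENT()/DEFINE_EVENT() above expands into a trace_<name>() helper
 * that btrfs code invokes at the matching point.  The wrapper function and
 * the "example" type label below are hypothetical; only the tracepoint name
 * and argument order come from the btrfs_space_reservation definition above.
 */
static inline void example_trace_space_reservation(struct btrfs_fs_info *fs_info,
						   u64 total, u64 bytes)
{
	/*
	 * Compiles down to a no-op unless the tracepoint is enabled at
	 * runtime; when enabled it records "<type>: <val> reserve <bytes>".
	 */
	trace_btrfs_space_reservation(fs_info, "example", total, bytes, 1);
}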
// SPDX-License-Identifier: GPL-2.0-or-later /* Filesystem access-by-fd. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/anon_inodes.h> #include <linux/namei.h> #include <linux/file.h> #include <uapi/linux/mount.h> #include "internal.h" #include "mount.h" static inline const char *fetch_message_locked(struct fc_log *log, size_t len, bool *need_free) { const char *p; int index; if (unlikely(log->head == log->tail)) return ERR_PTR(-ENODATA); index = log->tail & (ARRAY_SIZE(log->buffer) - 1); p = log->buffer[index]; if (unlikely(strlen(p) > len)) return ERR_PTR(-EMSGSIZE); log->buffer[index] = NULL; *need_free = log->need_free & (1 << index); log->need_free &= ~(1 << index); log->tail++; return p; } /* * Allow the user to read back any error, warning or informational messages. * Only one message is returned for each read(2) call. 
*/ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; ssize_t err; const char *p __free(kfree) = NULL, *message; bool need_free; int n; err = mutex_lock_interruptible(&fc->uapi_mutex); if (err < 0) return err; message = fetch_message_locked(fc->log.log, len, &need_free); mutex_unlock(&fc->uapi_mutex); if (IS_ERR(message)) return PTR_ERR(message); if (need_free) p = message; n = strlen(message); if (copy_to_user(_buf, message, n)) return -EFAULT; return n; } static int fscontext_release(struct inode *inode, struct file *file) { struct fs_context *fc = file->private_data; if (fc) { file->private_data = NULL; put_fs_context(fc); } return 0; } const struct file_operations fscontext_fops = { .read = fscontext_read, .release = fscontext_release, }; /* * Attach a filesystem context to a file and an fd. */ static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) { int fd; fd = anon_inode_getfd("[fscontext]", &fscontext_fops, fc, O_RDWR | o_flags); if (fd < 0) put_fs_context(fc); return fd; } static int fscontext_alloc_log(struct fs_context *fc) { fc->log.log = kzalloc(sizeof(*fc->log.log), GFP_KERNEL); if (!fc->log.log) return -ENOMEM; refcount_set(&fc->log.log->usage, 1); fc->log.log->owner = fc->fs_type->owner; return 0; } /* * Open a filesystem by name so that it can be configured for mounting. * * We are allowed to specify a container in which the filesystem will be * opened, thereby indicating which namespaces will be used (notably, which * network namespace will be used for network filesystems). */ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) { struct file_system_type *fs_type; struct fs_context *fc; const char *fs_name; int ret; if (!may_mount()) return -EPERM; if (flags & ~FSOPEN_CLOEXEC) return -EINVAL; fs_name = strndup_user(_fs_name, PAGE_SIZE); if (IS_ERR(fs_name)) return PTR_ERR(fs_name); fs_type = get_fs_type(fs_name); kfree(fs_name); if (!fs_type) return -ENODEV; fc = fs_context_for_mount(fs_type, 0); put_filesystem(fs_type); if (IS_ERR(fc)) return PTR_ERR(fc); fc->phase = FS_CONTEXT_CREATE_PARAMS; ret = fscontext_alloc_log(fc); if (ret < 0) goto err_fc; return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0); err_fc: put_fs_context(fc); return ret; } /* * Pick a superblock into a context for reconfiguration. */ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) { struct fs_context *fc; struct path target; unsigned int lookup_flags; int ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSPICK_CLOEXEC | FSPICK_SYMLINK_NOFOLLOW | FSPICK_NO_AUTOMOUNT | FSPICK_EMPTY_PATH)) != 0) return -EINVAL; lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; if (flags & FSPICK_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & FSPICK_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & FSPICK_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; ret = user_path_at(dfd, path, lookup_flags, &target); if (ret < 0) goto err; ret = -EINVAL; if (target.mnt->mnt_root != target.dentry) goto err_path; fc = fs_context_for_reconfigure(target.dentry, 0, 0); if (IS_ERR(fc)) { ret = PTR_ERR(fc); goto err_path; } fc->phase = FS_CONTEXT_RECONF_PARAMS; ret = fscontext_alloc_log(fc); if (ret < 0) goto err_fc; path_put(&target); return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? 
O_CLOEXEC : 0); err_fc: put_fs_context(fc); err_path: path_put(&target); err: return ret; } static int vfs_cmd_create(struct fs_context *fc, bool exclusive) { struct super_block *sb; int ret; if (fc->phase != FS_CONTEXT_CREATE_PARAMS) return -EBUSY; if (!mount_capable(fc)) return -EPERM; fc->phase = FS_CONTEXT_CREATING; fc->exclusive = exclusive; ret = vfs_get_tree(fc); if (ret) { fc->phase = FS_CONTEXT_FAILED; return ret; } sb = fc->root->d_sb; ret = security_sb_kern_mount(sb); if (unlikely(ret)) { fc_drop_locked(fc); fc->phase = FS_CONTEXT_FAILED; return ret; } /* vfs_get_tree() callchains will have grabbed @s_umount */ up_write(&sb->s_umount); fc->phase = FS_CONTEXT_AWAITING_MOUNT; return 0; } static int vfs_cmd_reconfigure(struct fs_context *fc) { struct super_block *sb; int ret; if (fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; fc->phase = FS_CONTEXT_RECONFIGURING; sb = fc->root->d_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { fc->phase = FS_CONTEXT_FAILED; return -EPERM; } down_write(&sb->s_umount); ret = reconfigure_super(fc); up_write(&sb->s_umount); if (ret) { fc->phase = FS_CONTEXT_FAILED; return ret; } vfs_clean_context(fc); return 0; } /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. */ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, struct fs_parameter *param) { int ret; ret = finish_clean_context(fc); if (ret) return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: return vfs_cmd_create(fc, false); case FSCONFIG_CMD_CREATE_EXCL: return vfs_cmd_create(fc, true); case FSCONFIG_CMD_RECONFIGURE: return vfs_cmd_reconfigure(fc); default: if (fc->phase != FS_CONTEXT_CREATE_PARAMS && fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; return vfs_parse_fs_param(fc, param); } } /** * sys_fsconfig - Set parameters and trigger actions on a context * @fd: The filesystem context to act upon * @cmd: The action to take * @_key: Where appropriate, the parameter key to set * @_value: Where appropriate, the parameter value to set * @aux: Additional information for the value * * This system call is used to set parameters on a context, including * superblock settings, data source and security labelling. * * Actions include triggering the creation of a superblock and the * reconfiguration of the superblock attached to the specified context. * * When setting a parameter, @cmd indicates the type of value being proposed * and @_key indicates the parameter to be altered. * * @_value and @aux are used to specify the value, should a value be required: * * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean * in nature. The key may be prefixed with "no" to invert the * setting. @_value must be NULL and @aux must be 0. * * (*) fsconfig_set_string: A string value is specified. The parameter can be * expecting boolean, integer, string or take a path. A conversion to an * appropriate type will be attempted (which may include looking up as a * path). @_value points to a NUL-terminated string and @aux must be 0. * * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the * blob and @aux indicates its size. The parameter must be expecting a * blob. * * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be * expecting a path object. @_value points to a NUL-terminated string that * is the path and @aux is a file descriptor at which to start a relative * lookup or AT_FDCWD. 
* * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH * implied. * * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be * NULL and @aux indicates the file descriptor. */ SYSCALL_DEFINE5(fsconfig, int, fd, unsigned int, cmd, const char __user *, _key, const void __user *, _value, int, aux) { struct fs_context *fc; int ret; int lookup_flags = 0; struct fs_parameter param = { .type = fs_value_is_undefined, }; if (fd < 0) return -EINVAL; switch (cmd) { case FSCONFIG_SET_FLAG: if (!_key || _value || aux) return -EINVAL; break; case FSCONFIG_SET_STRING: if (!_key || !_value || aux) return -EINVAL; break; case FSCONFIG_SET_BINARY: if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) return -EINVAL; break; case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) return -EINVAL; break; case FSCONFIG_SET_FD: if (!_key || _value || aux < 0) return -EINVAL; break; case FSCONFIG_CMD_CREATE: case FSCONFIG_CMD_CREATE_EXCL: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) return -EINVAL; break; default: return -EOPNOTSUPP; } CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; if (fc->ops == &legacy_fs_context_ops) { switch (cmd) { case FSCONFIG_SET_BINARY: case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: case FSCONFIG_CMD_CREATE_EXCL: return -EOPNOTSUPP; } } if (_key) { param.key = strndup_user(_key, 256); if (IS_ERR(param.key)) return PTR_ERR(param.key); } switch (cmd) { case FSCONFIG_SET_FLAG: param.type = fs_value_is_flag; break; case FSCONFIG_SET_STRING: param.type = fs_value_is_string; param.string = strndup_user(_value, 256); if (IS_ERR(param.string)) { ret = PTR_ERR(param.string); goto out_key; } param.size = strlen(param.string); break; case FSCONFIG_SET_BINARY: param.type = fs_value_is_blob; param.size = aux; param.blob = memdup_user_nul(_value, aux); if (IS_ERR(param.blob)) { ret = PTR_ERR(param.blob); goto out_key; } break; case FSCONFIG_SET_PATH_EMPTY: lookup_flags = LOOKUP_EMPTY; fallthrough; case FSCONFIG_SET_PATH: param.type = fs_value_is_filename; param.name = getname_flags(_value, lookup_flags); if (IS_ERR(param.name)) { ret = PTR_ERR(param.name); goto out_key; } param.dirfd = aux; param.size = strlen(param.name->name); break; case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; break; default: break; } ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret == 0) { ret = vfs_fsconfig_locked(fc, cmd, ¶m); mutex_unlock(&fc->uapi_mutex); } /* Clean up the our record of any value that we obtained from * userspace. Note that the value may have been stolen by the LSM or * filesystem, in which case the value pointer will have been cleared. */ switch (cmd) { case FSCONFIG_SET_STRING: case FSCONFIG_SET_BINARY: kfree(param.string); break; case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: if (param.name) putname(param.name); break; case FSCONFIG_SET_FD: if (param.file) fput(param.file); break; default: break; } out_key: kfree(param.key); return ret; } |
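/*
 * Illustrative userspace sketch, not part of the original file: the intended
 * calling sequence for the fsopen()/fsconfig() syscalls implemented above,
 * plus a read() of the error log serviced by fscontext_read().  Raw
 * syscall(2) is used on the assumption that the libc provides no dedicated
 * wrappers; the tmpfs parameters and the /mnt target are hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static int example_mount_tmpfs(void)
{
	char msg[256];
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "tmpfs", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		return -1;

	/* Set parameters, then ask the kernel to create the superblock. */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "size", "16M", 0);
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) {
		/* Each read() returns at most one logged message. */
		ssize_t n = read(fsfd, msg, sizeof(msg) - 1);

		if (n > 0)
			fprintf(stderr, "%.*s\n", (int)n, msg);
		close(fsfd);
		return -1;
	}

	/* Convert the context into a detached mount and attach it at /mnt. */
	mntfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd >= 0) {
		syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/mnt",
			MOVE_MOUNT_F_EMPTY_PATH);
		close(mntfd);
	}
	close(fsfd);
	return mntfd >= 0 ? 0 : -1;
}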
// SPDX-License-Identifier: GPL-2.0-only /* * Ram backed block device driver. * * Copyright (C) 2007 Nick Piggin * Copyright (C) 2007 Novell Inc. * * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright * of their respective owners. */ #include <linux/init.h> #include <linux/initrd.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/major.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/xarray.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/debugfs.h> #include <linux/uaccess.h> /* * Each block ramdisk device has an xarray brd_pages that stores * the pages containing the block device's contents. */ struct brd_device { int brd_number; struct gendisk *brd_disk; struct list_head brd_list; /* * Backing store of pages. This is the contents of the block device. */ struct xarray brd_pages; u64 brd_nr_pages; }; /* * Look up and return a brd's page with a reference grabbed for a given sector. */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { struct page *page; XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); rcu_read_lock(); repeat: page = xas_load(&xas); if (xas_retry(&xas, page)) { xas_reset(&xas); goto repeat; } if (!page) goto out; if (!get_page_unless_zero(page)) { xas_reset(&xas); goto repeat; } if (unlikely(page != xas_reload(&xas))) { put_page(page); xas_reset(&xas); goto repeat; } out: rcu_read_unlock(); return page; } /* * Insert a new page for a given sector, if one does not already exist. * A reference is grabbed on the returned page. */ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector, blk_opf_t opf) { gfp_t gfp = (opf & REQ_NOWAIT) ? 
GFP_NOWAIT : GFP_NOIO; struct page *page, *ret; page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); if (!page) return ERR_PTR(-ENOMEM); xa_lock(&brd->brd_pages); ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL, page, gfp); if (!ret) { brd->brd_nr_pages++; get_page(page); xa_unlock(&brd->brd_pages); return page; } if (!xa_is_err(ret)) { get_page(ret); xa_unlock(&brd->brd_pages); put_page(page); return ret; } xa_unlock(&brd->brd_pages); put_page(page); return ERR_PTR(xa_err(ret)); } /* * Free all backing store pages and xarray. This must only be called when * there are no other users of the device. */ static void brd_free_pages(struct brd_device *brd) { struct page *page; pgoff_t idx; xa_for_each(&brd->brd_pages, idx, page) { put_page(page); cond_resched(); } xa_destroy(&brd->brd_pages); } /* * Process a single segment. The segment is capped to not cross page boundaries * in both the bio and the brd backing memory. */ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) { struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); sector_t sector = bio->bi_iter.bi_sector; u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT; blk_opf_t opf = bio->bi_opf; struct page *page; void *kaddr; bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset); page = brd_lookup_page(brd, sector); if (!page && op_is_write(opf)) { page = brd_insert_page(brd, sector, opf); if (IS_ERR(page)) goto out_error; } kaddr = bvec_kmap_local(&bv); if (op_is_write(opf)) { memcpy_to_page(page, offset, kaddr, bv.bv_len); } else { if (page) memcpy_from_page(kaddr, page, offset, bv.bv_len); else memset(kaddr, 0, bv.bv_len); } kunmap_local(kaddr); bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len); if (page) put_page(page); return true; out_error: if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT)) bio_wouldblock_error(bio); else bio_io_error(bio); return false; } static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) { sector_t aligned_sector = round_up(sector, PAGE_SECTORS); sector_t aligned_end = round_down( sector + (size >> SECTOR_SHIFT), PAGE_SECTORS); struct page *page; if (aligned_end <= aligned_sector) return; xa_lock(&brd->brd_pages); while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) { page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); if (page) { put_page(page); brd->brd_nr_pages--; } aligned_sector += PAGE_SECTORS; } xa_unlock(&brd->brd_pages); } static void brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; if (unlikely(op_is_discard(bio->bi_opf))) { brd_do_discard(brd, bio->bi_iter.bi_sector, bio->bi_iter.bi_size); bio_endio(bio); return; } do { if (!brd_rw_bvec(brd, bio)) return; } while (bio->bi_iter.bi_size); bio_endio(bio); } static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .submit_bio = brd_submit_bio, }; /* * And now the modules code and kernel interface. 
*/ static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, 0444); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, ulong, 0444); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); MODULE_DESCRIPTION("Ram backed block device driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); #ifndef MODULE /* Legacy boot options - nonmodular */ static int __init ramdisk_size(char *str) { rd_size = simple_strtol(str, NULL, 0); return 1; } __setup("ramdisk_size=", ramdisk_size); #endif /* * The device scheme is derived from loop.c. Keep them in synch where possible * (should share code eventually). */ static LIST_HEAD(brd_devices); static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; static struct brd_device *brd_find_or_alloc_device(int i) { struct brd_device *brd; mutex_lock(&brd_devices_mutex); list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) { mutex_unlock(&brd_devices_mutex); return ERR_PTR(-EEXIST); } } brd = kzalloc(sizeof(*brd), GFP_KERNEL); if (!brd) { mutex_unlock(&brd_devices_mutex); return ERR_PTR(-ENOMEM); } brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); mutex_unlock(&brd_devices_mutex); return brd; } static void brd_free_device(struct brd_device *brd) { mutex_lock(&brd_devices_mutex); list_del(&brd->brd_list); mutex_unlock(&brd_devices_mutex); kfree(brd); } static int brd_alloc(int i) { struct brd_device *brd; struct gendisk *disk; char buf[DISK_NAME_LEN]; int err = -ENOMEM; struct queue_limits lim = { /* * This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN * (This is only a problem on very small devices <= 4M, * otherwise fdisk will align on 1M. Regardless this call * is harmless) */ .physical_block_size = PAGE_SIZE, .max_hw_discard_sectors = UINT_MAX, .max_discard_segments = 1, .discard_granularity = PAGE_SIZE, .features = BLK_FEAT_SYNCHRONOUS | BLK_FEAT_NOWAIT, }; brd = brd_find_or_alloc_device(i); if (IS_ERR(brd)) return PTR_ERR(brd); xa_init(&brd->brd_pages); snprintf(buf, DISK_NAME_LEN, "ram%d", i); if (!IS_ERR_OR_NULL(brd_debugfs_dir)) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_dev; } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); err = add_disk(disk); if (err) goto out_cleanup_disk; return 0; out_cleanup_disk: put_disk(disk); out_free_dev: brd_free_device(brd); return err; } static void brd_probe(dev_t dev) { brd_alloc(MINOR(dev) / max_part); } static void brd_cleanup(void) { struct brd_device *brd, *next; debugfs_remove_recursive(brd_debugfs_dir); list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { del_gendisk(brd->brd_disk); put_disk(brd->brd_disk); brd_free_pages(brd); brd_free_device(brd); } } static inline void brd_check_and_reset_par(void) { if (unlikely(!max_part)) max_part = 1; /* * make sure 'max_part' can be divided exactly by (1U << MINORBITS), * otherwise, it is possiable to get same dev_t when adding partitions. 
*/ if ((1U << MINORBITS) % max_part != 0) max_part = 1UL << fls(max_part); if (max_part > DISK_MAX_PARTS) { pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n", DISK_MAX_PARTS, DISK_MAX_PARTS); max_part = DISK_MAX_PARTS; } } static int __init brd_init(void) { int err, i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. * * (1) if rd_nr is specified, create that many upfront. else * it defaults to CONFIG_BLK_DEV_RAM_COUNT * (2) User can further extend brd devices by create dev node themselves * and have kernel automatically instantiate actual device * on-demand. Example: * mknod /path/devnod_name b 1 X # 1 is the rd major * fdisk -l /path/devnod_name * If (X / max_part) was not already created it will be created * dynamically. */ brd_check_and_reset_par(); brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { err = -EIO; goto out_free; } for (i = 0; i < rd_nr; i++) brd_alloc(i); pr_info("brd: module loaded\n"); return 0; out_free: brd_cleanup(); pr_info("brd: module NOT loaded !!!\n"); return err; } static void __exit brd_exit(void) { unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); brd_cleanup(); pr_info("brd: module unloaded\n"); } module_init(brd_init); module_exit(brd_exit); |
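/*
 * Illustrative standalone sketch (not kernel code): the sector-to-page
 * arithmetic used by brd_lookup_page(), brd_rw_bvec() and brd_do_discard()
 * above, assuming 4 KiB pages and 512-byte sectors (PAGE_SHIFT == 12,
 * SECTOR_SHIFT == 9, hence PAGE_SECTORS == 8).  One backing page in the
 * xarray covers PAGE_SECTORS consecutive sectors.
 */
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT            9
#define PAGE_SHIFT              12
#define PAGE_SECTORS_SHIFT      (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS            (1u << PAGE_SECTORS_SHIFT)

int main(void)
{
        uint64_t sector = 1234567;                      /* arbitrary example */
        uint64_t index = sector >> PAGE_SECTORS_SHIFT;  /* xarray slot */
        uint64_t offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;

        printf("sector %llu -> page index %llu, byte offset %llu\n",
               (unsigned long long)sector,
               (unsigned long long)index,
               (unsigned long long)offset);
        return 0;
}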
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 | /* SPDX-License-Identifier: GPL-2.0 */ /* * thermal_core.h * * Copyright (C) 2012 Intel Corp * Author: Durgadoss R <durgadoss.r@intel.com> */ #ifndef __THERMAL_CORE_H__ #define __THERMAL_CORE_H__ #include <linux/cleanup.h> #include <linux/device.h> #include <linux/thermal.h> #include "thermal_netlink.h" #include "thermal_thresholds.h" #include "thermal_debugfs.h" struct thermal_attr { struct device_attribute attr; char name[THERMAL_NAME_LENGTH]; }; struct thermal_trip_attrs { struct thermal_attr type; struct thermal_attr temp; struct thermal_attr hyst; }; struct thermal_trip_desc { struct thermal_trip trip; struct thermal_trip_attrs trip_attrs; struct list_head list_node; struct list_head thermal_instances; int threshold; }; /** * struct thermal_governor - structure that holds thermal governor information * @name: name of the governor * @bind_to_tz: callback called when binding to a thermal zone. If it * returns 0, the governor is bound to the thermal zone, * otherwise it fails. * @unbind_from_tz: callback called when a governor is unbound from a * thermal zone. * @trip_crossed: called for trip points that have just been crossed * @manage: called on thermal zone temperature updates * @update_tz: callback called when thermal zone internals have changed, e.g. 
* thermal cooling instance was added/removed * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { const char *name; int (*bind_to_tz)(struct thermal_zone_device *tz); void (*unbind_from_tz)(struct thermal_zone_device *tz); void (*trip_crossed)(struct thermal_zone_device *tz, const struct thermal_trip *trip, bool upward); void (*manage)(struct thermal_zone_device *tz); void (*update_tz)(struct thermal_zone_device *tz, enum thermal_notify_event reason); struct list_head governor_list; }; #define TZ_STATE_FLAG_SUSPENDED BIT(0) #define TZ_STATE_FLAG_RESUMING BIT(1) #define TZ_STATE_FLAG_INIT BIT(2) #define TZ_STATE_FLAG_EXIT BIT(3) #define TZ_STATE_READY 0 /** * struct thermal_zone_device - structure for a thermal zone * @id: unique id number for each thermal zone * @type: the thermal zone device type * @device: &struct device for this thermal zone * @removal: removal completion * @resume: resume completion * @trips_high: trips above the current zone temperature * @trips_reached: trips below or at the current zone temperature * @trips_invalid: trips with invalid temperature * @mode: current mode of this thermal zone * @devdata: private pointer for device private data * @num_trips: number of trip points the thermal zone supports * @passive_delay_jiffies: number of jiffies to wait between polls when * performing passive cooling. * @polling_delay_jiffies: number of jiffies to wait between polls when * checking whether trip points have been crossed (0 for * interrupt driven systems) * @recheck_delay_jiffies: delay after a failed attempt to determine the zone * temperature before trying again * @temperature: current temperature. This is only for core code, * drivers should use thermal_zone_get_temp() to get the * current temperature * @last_temperature: previous temperature read * @emul_temperature: emulated temperature when using CONFIG_THERMAL_EMULATION * @passive: 1 if you've crossed a passive trip point, 0 otherwise. * @prev_low_trip: the low current temperature if you've crossed a passive trip point. * @prev_high_trip: the above current temperature if you've crossed a passive trip point. 
* @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone * @governor_data: private pointer for governor data * @ida: &struct ida to generate unique id for this zone's cooling * devices * @lock: lock to protect thermal_instances list * @node: node in thermal_tz_list (in thermal_core.c) * @poll_queue: delayed work for polling * @notify_event: Last notification event * @state: current state of the thermal zone * @trips: array of struct thermal_trip objects */ struct thermal_zone_device { int id; char type[THERMAL_NAME_LENGTH]; struct device device; struct completion removal; struct completion resume; struct attribute_group trips_attribute_group; struct list_head trips_high; struct list_head trips_reached; struct list_head trips_invalid; enum thermal_device_mode mode; void *devdata; int num_trips; unsigned long passive_delay_jiffies; unsigned long polling_delay_jiffies; unsigned long recheck_delay_jiffies; int temperature; int last_temperature; int emul_temperature; int passive; int prev_low_trip; int prev_high_trip; struct thermal_zone_device_ops ops; struct thermal_zone_params *tzp; struct thermal_governor *governor; void *governor_data; struct ida ida; struct mutex lock; struct list_head node; struct delayed_work poll_queue; enum thermal_notify_event notify_event; u8 state; #ifdef CONFIG_THERMAL_DEBUGFS struct thermal_debugfs *debugfs; #endif struct list_head user_thresholds; struct thermal_trip_desc trips[] __counted_by(num_trips); }; DEFINE_GUARD(thermal_zone, struct thermal_zone_device *, mutex_lock(&_T->lock), mutex_unlock(&_T->lock)) DEFINE_GUARD(thermal_zone_reverse, struct thermal_zone_device *, mutex_unlock(&_T->lock), mutex_lock(&_T->lock)) /* Initial thermal zone temperature. */ #define THERMAL_TEMP_INIT INT_MIN /* * Default and maximum delay after a failed thermal zone temperature check * before attempting to check it again (in jiffies). 
*/ #define THERMAL_RECHECK_DELAY msecs_to_jiffies(250) #define THERMAL_MAX_RECHECK_DELAY (120 * HZ) /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) #define DEFAULT_THERMAL_GOVERNOR "step_wise" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE) #define DEFAULT_THERMAL_GOVERNOR "fair_share" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE) #define DEFAULT_THERMAL_GOVERNOR "user_space" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR) #define DEFAULT_THERMAL_GOVERNOR "power_allocator" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_BANG_BANG) #define DEFAULT_THERMAL_GOVERNOR "bang_bang" #endif /* Initial state of a cooling device during binding */ #define THERMAL_NO_TARGET -1UL /* Init section thermal table */ extern struct thermal_governor *__governor_thermal_table[]; extern struct thermal_governor *__governor_thermal_table_end[]; #define THERMAL_TABLE_ENTRY(table, name) \ static typeof(name) *__thermal_table_entry_##name \ __used __section("__" #table "_thermal_table") = &name #define THERMAL_GOVERNOR_DECLARE(name) THERMAL_TABLE_ENTRY(governor, name) #define for_each_governor_table(__governor) \ for (__governor = __governor_thermal_table; \ __governor < __governor_thermal_table_end; \ __governor++) int for_each_thermal_zone(int (*cb)(struct thermal_zone_device *, void *), void *); int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *, void *), void *); int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), void *thermal_governor); struct thermal_zone_device *thermal_zone_get_by_id(int id); DEFINE_CLASS(thermal_zone_get_by_id, struct thermal_zone_device *, if (_T) put_device(&_T->device), thermal_zone_get_by_id(id), int id) static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) { return cdev->ops->get_requested_power && cdev->ops->state2power && cdev->ops->power2state; } void thermal_cdev_update(struct thermal_cooling_device *); void thermal_cdev_update_nocheck(struct thermal_cooling_device *cdev); void __thermal_cdev_update(struct thermal_cooling_device *cdev); int get_tz_trend(struct thermal_zone_device *tz, const struct thermal_trip *trip); /* * This structure is used to describe the behavior of * a certain cooling device on a certain trip point * in a certain thermal zone */ struct thermal_instance { int id; char name[THERMAL_NAME_LENGTH]; struct thermal_cooling_device *cdev; const struct thermal_trip *trip; bool initialized; unsigned long upper; /* Highest cooling state for this trip point */ unsigned long lower; /* Lowest cooling state for this trip point */ unsigned long target; /* expected cooling state */ char attr_name[THERMAL_NAME_LENGTH]; struct device_attribute attr; char weight_attr_name[THERMAL_NAME_LENGTH]; struct device_attribute weight_attr; struct list_head trip_node; /* node in trip->thermal_instances */ struct list_head cdev_node; /* node in cdev->thermal_instances */ unsigned int weight; /* The weight of the cooling device */ bool upper_no_limit; }; #define to_thermal_zone(_dev) \ container_of(_dev, struct thermal_zone_device, device) #define to_cooling_device(_dev) \ container_of(_dev, struct thermal_cooling_device, device) int thermal_register_governor(struct thermal_governor *); void thermal_unregister_governor(struct thermal_governor *); int thermal_zone_device_set_policy(struct thermal_zone_device *, char *); int thermal_build_list_of_policies(char *buf); void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event); void 
thermal_zone_device_critical_reboot(struct thermal_zone_device *tz); void thermal_zone_device_critical_shutdown(struct thermal_zone_device *tz); void thermal_governor_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason); /* Helpers */ #define for_each_trip_desc(__tz, __td) \ for (__td = __tz->trips; __td - __tz->trips < __tz->num_trips; __td++) #define trip_to_trip_desc(__trip) \ container_of(__trip, struct thermal_trip_desc, trip) const char *thermal_trip_type_name(enum thermal_trip_type trip_type); void thermal_zone_set_trips(struct thermal_zone_device *tz, int low, int high); int thermal_zone_trip_id(const struct thermal_zone_device *tz, const struct thermal_trip *trip); int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); void thermal_zone_set_trip_hyst(struct thermal_zone_device *tz, struct thermal_trip *trip, int hyst); /* sysfs I/F */ int thermal_zone_create_device_groups(struct thermal_zone_device *tz); void thermal_zone_destroy_device_groups(struct thermal_zone_device *); void thermal_cooling_device_setup_sysfs(struct thermal_cooling_device *); void thermal_cooling_device_destroy_sysfs(struct thermal_cooling_device *cdev); void thermal_cooling_device_stats_reinit(struct thermal_cooling_device *cdev); /* used only at binding time */ ssize_t trip_point_show(struct device *, struct device_attribute *, char *); ssize_t weight_show(struct device *, struct device_attribute *, char *); ssize_t weight_store(struct device *, struct device_attribute *, const char *, size_t); #ifdef CONFIG_THERMAL_STATISTICS void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev, unsigned long new_state); #else static inline void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev, unsigned long new_state) {} #endif /* CONFIG_THERMAL_STATISTICS */ #endif /* __THERMAL_CORE_H__ */ |
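/*
 * Illustrative sketch (not an in-tree governor): the minimum wiring a
 * governor needs against the declarations in this header.  The "noop"
 * name and the empty callback bodies are placeholders; only the
 * registration pattern via THERMAL_GOVERNOR_DECLARE() mirrors what the
 * built-in governors do.
 */
#include <linux/thermal.h>

#include "thermal_core.h"

static void noop_manage(struct thermal_zone_device *tz)
{
        /* Called on zone temperature updates; a real governor would walk
         * the trip descriptors here and set cooling device targets. */
}

static void noop_trip_crossed(struct thermal_zone_device *tz,
                              const struct thermal_trip *trip,
                              bool upward)
{
        /* Called once per trip crossing, with the crossing direction. */
}

static struct thermal_governor thermal_gov_noop = {
        .name           = "noop",
        .manage         = noop_manage,
        .trip_crossed   = noop_trip_crossed,
};
THERMAL_GOVERNOR_DECLARE(thermal_gov_noop);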
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" #include "msft.h" #define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 #define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 #define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; __u8 evt_prefix[]; } __packed; #define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 #define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; __u8 data[]; }; struct msft_cp_le_monitor_advertisement { __u8 sub_opcode; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; __u8 data[]; } __packed; struct msft_rp_le_monitor_advertisement { __u8 status; __u8 sub_opcode; __u8 handle; } __packed; #define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 struct msft_cp_le_cancel_monitor_advertisement { __u8 sub_opcode; __u8 handle; } __packed; struct msft_rp_le_cancel_monitor_advertisement { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05 struct msft_cp_le_set_advertisement_filter_enable { __u8 sub_opcode; __u8 enable; } __packed; struct msft_rp_le_set_advertisement_filter_enable { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_EV_LE_MONITOR_DEVICE 0x02 struct msft_ev_le_monitor_device { __u8 addr_type; bdaddr_t bdaddr; __u8 monitor_handle; __u8 monitor_state; } __packed; struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; struct list_head list; }; enum monitor_addr_filter_state { AF_STATE_IDLE, AF_STATE_ADDING, AF_STATE_ADDED, AF_STATE_REMOVING, }; #define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 struct 
msft_monitor_addr_filter_data { __u8 msft_handle; __u8 pattern_handle; /* address filters pertain to */ __u16 mgmt_handle; int state; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 addr_type; bdaddr_t bdaddr; struct list_head list; }; struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; /* To synchronize add/remove address filter and monitor device event.*/ struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) { return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); } static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { struct msft_cp_read_supported_features cp; struct msft_rp_read_supported_features *rp; struct sk_buff *skb; cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", PTR_ERR(skb)); return false; } if (skb->len < sizeof(*rp)) { bt_dev_err(hdev, "MSFT supported features length mismatch"); goto failed; } rp = (struct msft_rp_read_supported_features *)skb->data; if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) goto failed; if (rp->evt_prefix_len > 0) { msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, GFP_KERNEL); if (!msft->evt_prefix) goto failed; } msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) hdev->msft_curve_validity = true; kfree_skb(skb); return true; failed: kfree_skb(skb); return false; } /* is_mgmt = true matches the handle exposed to userspace via mgmt. * is_mgmt = false matches the handle used by the msft controller. * This function requires the caller holds hdev->lock */ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data (struct hci_dev *hdev, u16 handle, bool is_mgmt) { struct msft_monitor_advertisement_handle_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->handle_map, list) { if (is_mgmt && entry->mgmt_handle == handle) return entry; if (!is_mgmt && entry->msft_handle == handle) return entry; } return NULL; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_find_address_data (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, u8 pattern_handle) { struct msft_monitor_addr_filter_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->address_filters, list) { if (entry->pattern_handle == pattern_handle && addr_type == entry->addr_type && !bacmp(addr, &entry->bdaddr)) return entry; } return NULL; } /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, bool notify) { struct monitored_device *dev, *tmp; int count = 0; list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { /* mgmt_handle == 0 indicates remove all devices, whereas, * bdaddr == NULL indicates remove all devices matching the * mgmt_handle. 
*/ if ((!mgmt_handle || dev->handle == mgmt_handle) && (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) && addr_type == dev->addr_type))) { if (notify && dev->notified) { mgmt_adv_monitor_device_lost(hdev, dev->handle, &dev->bdaddr, dev->addr_type); } list_del(&dev->list); kfree(dev); count++; } } return count; } static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; hci_dev_lock(hdev); rp = (struct msft_rp_le_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } status = rp->status; if (status) goto unlock; handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); if (!handle_data) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); monitor->state = ADV_MONITOR_STATE_OFFLOADED; unlock: if (status) hci_free_adv_monitor(hdev, monitor); hci_dev_unlock(hdev); return status; } /* This function requires the caller holds hci_req_sync_lock */ static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) { struct msft_monitor_addr_filter_data *address_filter, *n; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct list_head head; struct sk_buff *skb; INIT_LIST_HEAD(&head); /* Cancel all corresponding address monitors */ mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { if (address_filter->pattern_handle != handle) continue; list_del(&address_filter->list); /* Keep the address filter and let * msft_add_address_filter_sync() remove and free the address * filter. 
*/ if (address_filter->state == AF_STATE_ADDING) { address_filter->state = AF_STATE_REMOVING; continue; } /* Keep the address filter and let * msft_cancel_address_filter_sync() remove and free the address * filter */ if (address_filter->state == AF_STATE_REMOVING) continue; list_add_tail(&address_filter->list, &head); } mutex_unlock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &head, list) { list_del(&address_filter->list); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { kfree(address_filter); continue; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); kfree(address_filter); } } static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_cancel_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto done; } status = rp->status; if (status) goto done; hci_dev_lock(hdev); handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (handle_data) { if (monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; /* Do not free the monitor if it is being removed due to * suspend. It will be re-monitored on resume. */ if (!msft->suspending) { hci_free_adv_monitor(hdev, monitor); /* Clear any monitored devices by this Adv Monitor */ msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL, 0, false); } msft_handle = handle_data->msft_handle; list_del(&handle_data->list); kfree(handle_data); hci_dev_unlock(hdev); msft_remove_addr_filters_sync(hdev, msft_handle); } else { hci_dev_unlock(hdev); } done: return status; } /* This function requires the caller holds hci_req_sync_lock */ static int msft_remove_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; struct sk_buff *skb; handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ if (!handle_data) return -ENOENT; cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = handle_data->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) return PTR_ERR(skb); return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); } /* This function requires the caller holds hci_req_sync_lock */ int msft_suspend_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct adv_monitor *monitor; int handle = 0; if (!msft || !msft_monitor_supported(hdev)) return 0; msft->suspending = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_remove_monitor_sync(hdev, monitor); handle++; } /* All monitors have been removed */ msft->suspending = false; return 0; } static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) { struct adv_rssi_thresholds *r = &monitor->rssi; if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || r->low_threshold < 
MSFT_RSSI_THRESHOLD_VALUE_MIN || r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) return false; /* High_threshold_timeout is not supported, * once high_threshold is reached, events are immediately reported. */ if (r->high_threshold_timeout != 0) return false; if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) return false; /* Sampling period from 0x00 to 0xFF are all allowed */ return true; } static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) { return msft_monitor_rssi_valid(monitor); /* No additional check needed for pattern-based monitor */ } static int msft_add_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; list_for_each_entry(entry, &monitor->patterns, list) { pattern_count++; total_size += sizeof(*pattern) + entry->length; } cp = kmalloc(total_size, GFP_KERNEL); if (!cp) return -ENOMEM; cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = monitor->rssi.high_threshold; cp->rssi_low = monitor->rssi.low_threshold; cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; cp->rssi_sampling_period = monitor->rssi.sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; pattern_data = (void *)cp->data; pattern_data->count = pattern_count; list_for_each_entry(entry, &monitor->patterns, list) { pattern = (void *)(pattern_data->data + offset); /* the length also includes data_type and offset */ pattern->length = entry->length + 2; pattern->data_type = entry->ad_type; pattern->start_byte = entry->offset; memcpy(pattern->pattern, entry->value, entry->length); offset += sizeof(*pattern) + entry->length; } skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto out_free; } err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); if (err) goto out_free; handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (!handle_data) { err = -ENODATA; goto out_free; } handle_data->rssi_high = cp->rssi_high; handle_data->rssi_low = cp->rssi_low; handle_data->rssi_low_interval = cp->rssi_low_interval; handle_data->rssi_sampling_period = cp->rssi_sampling_period; out_free: kfree(cp); return err; } /* This function requires the caller holds hci_req_sync_lock */ static void reregister_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; int handle = 0; if (!msft) return; msft->resuming = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_add_monitor_sync(hdev, monitor); handle++; } /* All monitors have been reregistered */ msft->resuming = false; } /* This function requires the caller holds hci_req_sync_lock */ int msft_resume_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft || !msft_monitor_supported(hdev)) return 0; hci_dev_lock(hdev); /* Clear already tracked devices on resume. Once the monitors are * reregistered, devices in range will be found again after resume. 
*/ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); reregister_monitor(hdev); return 0; } /* This function requires the caller holds hci_req_sync_lock */ void msft_do_open(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (hdev->msft_opcode == HCI_OP_NOP) return; if (!msft) { bt_dev_err(hdev, "MSFT extension not registered"); return; } bt_dev_dbg(hdev, "Initialize MSFT extension"); /* Reset existing MSFT data before re-reading */ kfree(msft->evt_prefix); msft->evt_prefix = NULL; msft->evt_prefix_len = 0; msft->features = 0; if (!read_supported_features(hdev, msft)) { hdev->msft_data = NULL; kfree(msft); return; } if (msft_monitor_supported(hdev)) { msft->resuming = true; msft_set_filter_enable(hdev, true); /* Monitors get removed on power off, so we need to explicitly * tell the controller to re-monitor. */ reregister_monitor(hdev); } } void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct msft_monitor_advertisement_handle_data *handle_data, *tmp; struct msft_monitor_addr_filter_data *address_filter, *n; struct adv_monitor *monitor; if (!msft) return; bt_dev_dbg(hdev, "Cleanup of MSFT extension"); /* The controller will silently remove all monitors on power off. * Therefore, remove handle_data mapping and reset monitor state. */ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; list_del(&handle_data->list); kfree(handle_data); } mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { list_del(&address_filter->list); kfree(address_filter); } mutex_unlock(&msft->filter_lock); hci_dev_lock(hdev); /* Clear any devices that are being monitored and notify device lost */ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); } static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb; int err = 0; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", &address_filter->bdaddr); err = PTR_ERR(skb); goto done; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); done: kfree(address_filter); return err; } void msft_register(struct hci_dev *hdev) { struct msft_data *msft = NULL; bt_dev_dbg(hdev, "Register MSFT extension"); msft = kzalloc(sizeof(*msft), GFP_KERNEL); if (!msft) { bt_dev_err(hdev, "Failed to register MSFT extension"); return; } INIT_LIST_HEAD(&msft->handle_map); INIT_LIST_HEAD(&msft->address_filters); hdev->msft_data = msft; mutex_init(&msft->filter_lock); } void msft_release(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft) 
return; bt_dev_dbg(hdev, "Unregister MSFT extension"); hdev->msft_data = NULL; kfree(msft->evt_prefix); mutex_destroy(&msft->filter_lock); kfree(msft); } /* This function requires the caller holds hdev->lock */ static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { struct monitored_device *dev; dev = kmalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { bt_dev_err(hdev, "MSFT vendor event %u: no memory", MSFT_EV_LE_MONITOR_DEVICE); return; } bacpy(&dev->bdaddr, bdaddr); dev->addr_type = addr_type; dev->handle = mgmt_handle; dev->notified = false; INIT_LIST_HEAD(&dev->list); list_add(&dev->list, &hdev->monitored_devices); hdev->advmon_pend_notify = true; } /* This function requires the caller holds hdev->lock */ static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type, true)) { bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list", MSFT_EV_LE_MONITOR_DEVICE, bdaddr); } } static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, u8 ev, size_t len) { void *data; data = skb_pull_data(skb, len); if (!data) bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev); return data; } static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_rp_le_monitor_advertisement *rp; struct msft_cp_le_monitor_advertisement *cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb = NULL; bool remove = false; size_t size; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return -ENODEV; /* We are safe to use the address filter from now on. * msft_monitor_device_evt() wouldn't delete this filter because it's * not been added by now. * And all other functions that requiring hci_req_sync_lock wouldn't * touch this filter before this func completes because it's protected * by hci_req_sync_lock. 
*/ if (address_filter->state == AF_STATE_REMOVING) { mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); kfree(address_filter); return 0; } size = sizeof(*cp) + sizeof(address_filter->addr_type) + sizeof(address_filter->bdaddr); cp = kzalloc(size, GFP_KERNEL); if (!cp) { bt_dev_err(hdev, "MSFT: Alloc cmd param err"); remove = true; goto done; } cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = address_filter->rssi_high; cp->rssi_low = address_filter->rssi_low; cp->rssi_low_interval = address_filter->rssi_low_interval; cp->rssi_sampling_period = address_filter->rssi_sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR; cp->data[0] = address_filter->addr_type; memcpy(&cp->data[1], &address_filter->bdaddr, sizeof(address_filter->bdaddr)); skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp, HCI_CMD_TIMEOUT); kfree(cp); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to enable address %pMR filter", &address_filter->bdaddr); skb = NULL; remove = true; goto done; } rp = skb_pull_data(skb, sizeof(*rp)); if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT || rp->status) remove = true; done: mutex_lock(&msft->filter_lock); if (remove) { bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter", &address_filter->bdaddr); list_del(&address_filter->list); kfree(address_filter); } else { address_filter->state = AF_STATE_ADDED; address_filter->msft_handle = rp->handle; bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled", &address_filter->bdaddr); } mutex_unlock(&msft->filter_lock); kfree_skb(skb); return 0; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_add_address_filter (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr, struct msft_monitor_advertisement_handle_data *handle_data) { struct msft_monitor_addr_filter_data *address_filter = NULL; struct msft_data *msft = hdev->msft_data; int err; address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); if (!address_filter) return NULL; address_filter->state = AF_STATE_ADDING; address_filter->msft_handle = 0xff; address_filter->pattern_handle = handle_data->msft_handle; address_filter->mgmt_handle = handle_data->mgmt_handle; address_filter->rssi_high = handle_data->rssi_high; address_filter->rssi_low = handle_data->rssi_low; address_filter->rssi_low_interval = handle_data->rssi_low_interval; address_filter->rssi_sampling_period = handle_data->rssi_sampling_period; address_filter->addr_type = addr_type; bacpy(&address_filter->bdaddr, bdaddr); /* With the above AF_STATE_ADDING, duplicated address filter can be * avoided when receiving monitor device event (found/lost) frequently * for the same device. 
*/ list_add_tail(&address_filter->list, &msft->address_filters); err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync, address_filter, NULL); if (err < 0) { bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr); list_del(&address_filter->list); kfree(address_filter); return NULL; } bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter", &address_filter->bdaddr); return address_filter; } /* This function requires the caller holds hdev->lock */ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct msft_monitor_addr_filter_data *n, *address_filter = NULL; struct msft_ev_le_monitor_device *ev; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; u16 mgmt_handle = 0xffff; u8 addr_type; ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); if (!ev) return; bt_dev_dbg(hdev, "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR", MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle, ev->monitor_state, &ev->bdaddr); handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) { if (!handle_data) return; mgmt_handle = handle_data->mgmt_handle; goto report_state; } if (handle_data) { /* Don't report any device found/lost event from pattern * monitors. Pattern monitor always has its address filters for * tracking devices. */ address_filter = msft_find_address_data(hdev, ev->addr_type, &ev->bdaddr, handle_data->msft_handle); if (address_filter) return; if (ev->monitor_state && handle_data->cond_type == MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN) msft_add_address_filter(hdev, ev->addr_type, &ev->bdaddr, handle_data); return; } /* This device event is not from pattern monitor. * Report it if there is a corresponding address_filter for it. */ list_for_each_entry(n, &msft->address_filters, list) { if (n->state == AF_STATE_ADDED && n->msft_handle == ev->monitor_handle) { mgmt_handle = n->mgmt_handle; address_filter = n; break; } } if (!address_filter) { bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u", &ev->bdaddr, ev->monitor_handle, ev->monitor_state); return; } report_state: switch (ev->addr_type) { case ADDR_LE_DEV_PUBLIC: addr_type = BDADDR_LE_PUBLIC; break; case ADDR_LE_DEV_RANDOM: addr_type = BDADDR_LE_RANDOM; break; default: bt_dev_err(hdev, "MSFT vendor event 0x%02x: unknown addr type 0x%02x", MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type); return; } if (ev->monitor_state) { msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle); } else { if (address_filter && address_filter->state == AF_STATE_ADDED) { address_filter->state = AF_STATE_REMOVING; hci_cmd_sync_queue(hdev, msft_cancel_address_filter_sync, address_filter, NULL); } msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle); } } void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct msft_data *msft = hdev->msft_data; u8 *evt_prefix; u8 *evt; if (!msft) return; /* When the extension has defined an event prefix, check that it * matches, and otherwise just return. */ if (msft->evt_prefix_len > 0) { evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len); if (!evt_prefix) return; if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len)) return; } /* Every event starts at least with an event code and the rest of * the data is variable and depends on the event code. 
*/ if (skb->len < 1) return; evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt)); if (!evt) return; hci_dev_lock(hdev); switch (*evt) { case MSFT_EV_LE_MONITOR_DEVICE: mutex_lock(&msft->filter_lock); msft_monitor_device_evt(hdev, skb); mutex_unlock(&msft->filter_lock); break; default: bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt); break; } hci_dev_unlock(hdev); } __u64 msft_get_features(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; return msft ? msft->features : 0; } static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, void *user_data, u8 status) { struct msft_cp_le_set_advertisement_filter_enable *cp = user_data; struct msft_data *msft = hdev->msft_data; /* Error 0x0C would be returned if the filter enabled status is * already set to whatever we were trying to set. * Although the default state should be disabled, some controller set * the initial value to enabled. Because there is no way to know the * actual initial value before sending this command, here we also treat * error 0x0C as success. */ if (status != 0x00 && status != 0x0C) return; hci_dev_lock(hdev); msft->filter_enabled = cp->enable; if (status == 0x0C) bt_dev_warn(hdev, "MSFT filter_enable is already %s", cp->enable ? "on" : "off"); hci_dev_unlock(hdev); } /* This function requires the caller holds hci_req_sync_lock */ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_add_monitor_sync(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_remove_monitor_sync(hdev, monitor); } int msft_set_filter_enable(struct hci_dev *hdev, bool enable) { struct msft_cp_le_set_advertisement_filter_enable cp; struct msft_data *msft = hdev->msft_data; int err; if (!msft) return -EOPNOTSUPP; cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; cp.enable = enable; err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err); return 0; } bool msft_curve_validity(struct hci_dev *hdev) { return hdev->msft_curve_validity; } |
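/*
 * Illustrative sketch (not part of net/bluetooth/msft.c): how a vendor
 * transport driver typically opts in to this extension from its setup
 * path.  The function name and the 0xfc1e opcode are placeholders; the
 * real value is whatever vendor-specific HCI opcode the controller
 * firmware assigns to the MSFT command set.
 */
#include <net/bluetooth/hci_core.h>

static int example_vendor_setup(struct hci_dev *hdev)
{
        /* Record the vendor opcode that carries the MSFT sub-commands
         * (the sub_opcode is always the first parameter byte).  With
         * this set, msft_do_open() will query the supported features
         * and event prefix when the controller is powered up. */
        hci_set_msft_opcode(hdev, 0xfc1e);

        return 0;
}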
/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/ */ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H #include <linux/uio.h> #include <linux/types.h> #include <linux/page-flags.h> #include <linux/slab.h> #include <linux/crc32.h> #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/sched/mm.h> #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/part_stat.h> #include <linux/rw_hint.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> struct pagevec; #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else #define f2fs_bug_on(sbi, condition) \ do { \ if (WARN_ON(condition)) \ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } while (0) #endif enum { FAULT_KMALLOC, FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, FAULT_TRUNCATE, FAULT_READ_IO, FAULT_CHECKPOINT, FAULT_DISCARD, FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, FAULT_BLKADDR_VALIDITY, FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, FAULT_INCONSISTENT_FOOTER, FAULT_TIMEOUT, FAULT_VMALLOC, FAULT_MAX, }; /* indicate which option to update */ enum fault_option { FAULT_RATE = 1, /* only update fault rate */ FAULT_TYPE = 2, /* only update fault type */ FAULT_ALL = 4, /* reset all fault injection options/stats */ }; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info { atomic_t inject_ops; int inject_rate; unsigned int inject_type; /* Used to account total count of injection for each type */ unsigned int inject_count[FAULT_MAX]; }; extern const char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) /* maximum retry count for injected failure */ #define DEFAULT_FAILURE_RETRY_COUNT 8 #else #define DEFAULT_FAILURE_RETRY_COUNT 1 #endif /* * For mount options */ enum f2fs_mount_opt { F2FS_MOUNT_DISABLE_ROLL_FORWARD, F2FS_MOUNT_DISCARD, F2FS_MOUNT_NOHEAP, F2FS_MOUNT_XATTR_USER, F2FS_MOUNT_POSIX_ACL, F2FS_MOUNT_DISABLE_EXT_IDENTIFY, F2FS_MOUNT_INLINE_XATTR, F2FS_MOUNT_INLINE_DATA, F2FS_MOUNT_INLINE_DENTRY, F2FS_MOUNT_FLUSH_MERGE, F2FS_MOUNT_NOBARRIER, F2FS_MOUNT_FASTBOOT, F2FS_MOUNT_READ_EXTENT_CACHE, F2FS_MOUNT_DATA_FLUSH, F2FS_MOUNT_FAULT_INJECTION, F2FS_MOUNT_USRQUOTA, F2FS_MOUNT_GRPQUOTA, F2FS_MOUNT_PRJQUOTA, F2FS_MOUNT_QUOTA, F2FS_MOUNT_INLINE_XATTR_SIZE, F2FS_MOUNT_RESERVE_ROOT, F2FS_MOUNT_DISABLE_CHECKPOINT, F2FS_MOUNT_NORECOVERY, F2FS_MOUNT_ATGC, F2FS_MOUNT_MERGE_CHECKPOINT, F2FS_MOUNT_GC_MERGE, F2FS_MOUNT_COMPRESS_CACHE, F2FS_MOUNT_AGE_EXTENT_CACHE, F2FS_MOUNT_NAT_BITS, F2FS_MOUNT_INLINECRYPT, /* * Some f2fs environments expect to be able to pass the "lazytime" option * string rather than using the MS_LAZYTIME flag, so this must remain. */ F2FS_MOUNT_LAZYTIME, F2FS_MOUNT_RESERVE_NODE, }; #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) \ (F2FS_OPTION(sbi).opt &= ~BIT(F2FS_MOUNT_##option)) #define set_opt(sbi, option) \ (F2FS_OPTION(sbi).opt |= BIT(F2FS_MOUNT_##option)) #define test_opt(sbi, option) \ (F2FS_OPTION(sbi).opt & BIT(F2FS_MOUNT_##option)) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ ((long long)((a) - (b)) > 0)) typedef u32 block_t; /* * should not change u32, since it is the on-disk block * address format, __le32. 
*/ typedef u32 nid_t; #define COMPRESS_EXT_NUM 16 enum blkzone_allocation_policy { BLKZONE_ALLOC_PRIOR_SEQ, /* Prioritize writing to sequential zones */ BLKZONE_ALLOC_ONLY_SEQ, /* Only allow writing to sequential zones */ BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; enum bggc_io_aware_policy { AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ AWARE_READ_IO, /* skip background GC if there is pending read IO */ AWARE_NONE, /* don't aware IO for background GC */ }; enum device_allocation_policy { ALLOCATE_FORWARD_NOHINT, ALLOCATE_FORWARD_WITHIN_HINT, ALLOCATE_FORWARD_FROM_HINT, }; /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock * while sleeping on the write lock but the write lock is needed by * higher-priority clients. */ struct f2fs_rwsem { struct rw_semaphore internal_rwsem; #ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_queue_head_t read_waiters; #endif }; struct f2fs_mount_info { unsigned long long opt; block_t root_reserved_blocks; /* root reserved blocks */ block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ int active_logs; /* # of active logs */ int inline_xattr_size; /* inline xattr size */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; /* For fault injection */ #endif #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ int memory_mode; /* memory mode */ int errors; /* errors parameter */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, * segment or section */ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ unsigned char nocompress_ext_cnt; /* nocompress extension count */ int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned int lookup_mode; }; #define F2FS_FEATURE_ENCRYPT 0x00000001 #define F2FS_FEATURE_BLKZONED 0x00000002 #define F2FS_FEATURE_ATOMIC_WRITE 0x00000004 #define F2FS_FEATURE_EXTRA_ATTR 0x00000008 #define F2FS_FEATURE_PRJQUOTA 0x00000010 #define F2FS_FEATURE_INODE_CHKSUM 0x00000020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x00000040 #define F2FS_FEATURE_QUOTA_INO 0x00000080 #define F2FS_FEATURE_INODE_CRTIME 0x00000100 #define F2FS_FEATURE_LOST_FOUND 0x00000200 #define F2FS_FEATURE_VERITY 0x00000400 #define F2FS_FEATURE_SB_CHKSUM 0x00000800 #define F2FS_FEATURE_CASEFOLD 0x00001000 #define F2FS_FEATURE_COMPRESSION 0x00002000 #define F2FS_FEATURE_RO 0x00004000 #define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 #define 
F2FS_FEATURE_PACKED_SSA 0x00010000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) /* * Default values for user and/or group using reserved blocks */ #define F2FS_DEF_RESUID 0 #define F2FS_DEF_RESGID 0 /* * For checkpoint manager */ enum { NAT_BITMAP, SIT_BITMAP }; #define CP_UMOUNT 0x00000001 #define CP_FASTBOOT 0x00000002 #define CP_SYNC 0x00000004 #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 #define CP_PAUSE 0x00000040 #define CP_RESIZE 0x00000080 #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ #define DEF_ENABLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ enum cp_time { CP_TIME_START, /* begin */ CP_TIME_LOCK, /* after cp_global_sem */ CP_TIME_OP_LOCK, /* after block_operation */ CP_TIME_FLUSH_META, /* after flush sit/nat */ CP_TIME_SYNC_META, /* after sync_meta_pages */ CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ CP_TIME_FLUSH_DEVICE, /* after flush device cache */ CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ CP_TIME_END, /* after unblock_operation */ CP_TIME_MAX, }; /* time cost stats of checkpoint */ struct cp_stats { ktime_t times[CP_TIME_MAX]; }; struct cp_control { int reason; __u64 trim_start; __u64 trim_end; __u64 trim_minlen; struct cp_stats stats; }; enum f2fs_cp_phase { CP_PHASE_START_BLOCK_OPS, CP_PHASE_FINISH_BLOCK_OPS, CP_PHASE_FINISH_CHECKPOINT, }; /* * indicate meta/data type */ enum { META_CP, META_NAT, META_SIT, META_SSA, META_MAX, META_POR, DATA_GENERIC, /* check range only */ DATA_GENERIC_ENHANCE, /* strong check on range and segment bitmap */ DATA_GENERIC_ENHANCE_READ, /* * strong check on range and segment * bitmap but no warning due to race * condition of read on truncated area * by extent_cache */ DATA_GENERIC_ENHANCE_UPDATE, /* * strong check on range and segment * bitmap for update case */ META_GENERIC, }; /* for the list of ino */ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ TRANS_DIR_INO, /* for transactions dir ino list */ XATTR_DIR_INO, /* for xattr updated dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. 
list */ }; struct ino_entry { struct list_head list; /* list head */ nid_t ino; /* inode number */ unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ }; struct fsync_node_entry { struct list_head list; /* list head */ struct folio *folio; /* warm node folio pointer */ unsigned int seq_id; /* sequence id */ }; struct ckpt_req { struct completion wait; /* completion for checkpoint done */ struct llist_node llnode; /* llist_node to be linked in wait queue */ int ret; /* return code of checkpoint */ union { ktime_t queue_time; /* request queued time */ ktime_t delta_time; /* time in queue */ }; }; struct ckpt_req_control { struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ atomic_t issued_ckpt; /* # of actually issued ckpts */ atomic_t total_ckpt; /* # of total ckpts */ atomic_t queued_ckpt; /* # of queued ckpts */ struct llist_head issue_list; /* list for command issue */ spinlock_t stat_lock; /* lock for below checkpoint time stats */ unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ unsigned int peak_time; /* peak wait time in msec until now */ }; /* a time threshold that checkpoint was blocked for, unit: ms */ #define CP_LONG_LATENCY_THRESHOLD 5000 /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ block_t start_blkaddr; /* start blockaddr of current segment */ unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; /* minimum discard granularity, unit: block count */ #define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ #define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* default interval of periodical discard submission */ #define DEFAULT_DISCARD_INTERVAL (msecs_to_jiffies(20)) /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
\ (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ D_PARTIAL, /* partially submitted */ D_SUBMIT, /* all submitted */ D_DONE, /* finished */ }; struct discard_info { block_t lstart; /* logical start address */ block_t len; /* length */ block_t start; /* actual start address in dev */ }; struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ struct discard_info di; /* discard info */ struct list_head list; /* command list */ struct completion wait; /* completion */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ unsigned char queued; /* queued discard */ int error; /* bio error */ spinlock_t lock; /* for state/bio_ref updating */ unsigned short bio_ref; /* bio reference count */ }; enum { DPOLICY_BG, DPOLICY_FORCE, DPOLICY_FSTRIM, DPOLICY_UMOUNT, MAX_DPOLICY, }; enum { DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */ DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */ DPOLICY_IO_AWARE_MAX, }; struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ unsigned int mid_interval; /* used for device busy */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ bool timeout; /* discard timeout for put_super */ unsigned int granularity; /* discard granularity */ }; struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ unsigned int max_discard_request; /* max. discard request per round */ unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. 
interval between discard issue */ unsigned int discard_io_aware_gran; /* minimum discard granularity not be aware of I/O */ unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int discard_io_aware; /* io_aware policy */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ atomic_t queued_discard; /* # of queued discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ bool discard_wake; /* to wake up discard thread */ }; /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ }; #define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) #define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) #define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) #define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) #define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) #define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); journal->n_nats = cpu_to_le16(before + i); return before; } static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); journal->n_sits = cpu_to_le16(before + i); return before; } static inline bool __has_cursum_space(struct f2fs_journal *journal, int size, int type) { if (type == NAT_JOURNAL) return size <= MAX_NAT_JENTRIES(journal); return size <= MAX_SIT_JENTRIES(journal); } /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ (CUR_ADDRS_PER_INODE(inode) - \ get_inline_xattr_addrs(inode) - \ DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ BITS_PER_BYTE + 1)) #define INLINE_DENTRY_BITMAP_SIZE(inode) \ DIV_ROUND_UP(NR_INLINE_DENTRY(inode), BITS_PER_BYTE) #define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ NR_INLINE_DENTRY(inode) + \ INLINE_DENTRY_BITMAP_SIZE(inode))) /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_filename { /* * The filename the user specified. This is NULL for some * filesystem-internal operations, e.g. converting an inline directory * to a non-inline one, or roll-forward recovering an encrypted dentry. */ const struct qstr *usr_fname; /* * The on-disk filename. For encrypted directories, this is encrypted. * This may be NULL for lookups in an encrypted dir without the key. 
*/ struct fscrypt_str disk_name; /* The dirhash of this filename */ f2fs_hash_t hash; #ifdef CONFIG_FS_ENCRYPTION /* * For lookups in encrypted directories: either the buffer backing * disk_name, or a buffer that holds the decoded no-key name. */ struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL * if the original name is not valid Unicode, if the original name is * "." or "..", if the directory is both casefolded and encrypted and * its encryption key is unavailable, or if the filesystem is doing an * internal operation where usr_fname is also NULL. In all these cases * we fall back to treating the name as an opaque byte sequence. */ struct qstr cf_name; #endif }; struct f2fs_dentry_ptr { struct inode *inode; void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } static inline void make_dentry_ptr_inline(struct inode *inode, struct f2fs_dentry_ptr *d, void *t) { int entry_cnt = NR_INLINE_DENTRY(inode); int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); int reserved_size = INLINE_RESERVED_SIZE(inode); d->inode = inode; d->max = entry_cnt; d->nr_bitmap = bitmap_size; d->bitmap = t; d->dentry = t + bitmap_size + reserved_size; d->filename = t + bitmap_size + reserved_size + SIZE_OF_DIR_ENTRY * entry_cnt; } /* * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 * as its node offset to distinguish from index node blocks. * But some bits are used to mark the node block. */ #define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ >> OFFSET_BIT_SHIFT) enum { ALLOC_NODE, /* allocate a new node page if needed */ LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called * by get_data_block. 
*/ }; #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ /* IO/non-IO congestion wait timeout value, default: 1ms */ #define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1)) /* timeout value injected, default: 1000ms */ #define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 /* maximum retry of EIO'ed page */ #define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS #define RECOVERY_MIN_RA_BLOCKS 1 #define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 /* number of age extent info in extent cache we try to shrink */ #define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 #define LAST_AGE_WEIGHT 30 #define SAME_AGE_REGION 1024 /* * Define data block with age less than 1GB as hot data * define data block with age less than 10GB but more than 1GB as warm data */ #define DEF_HOT_DATA_AGE_THRESHOLD 262144 #define DEF_WARM_DATA_AGE_THRESHOLD 2621440 /* default max read extent count per inode */ #define DEF_MAX_READ_EXTENT_COUNT 10240 /* extent cache type */ enum extent_type { EX_READ, EX_BLOCK_AGE, NR_EXTENT_CACHES, }; /* * Reserved value to mark invalid age extents, hence valid block range * from 0 to ULLONG_MAX-1 */ #define F2FS_EXTENT_AGE_INVALID ULLONG_MAX struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ union { /* read extent_cache */ struct { /* start block address of the extent */ block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION /* physical extent length of compressed blocks */ unsigned int c_len; #endif }; /* block age extent_cache */ struct { /* block age of the extent */ unsigned long long age; /* last total blocks allocated */ unsigned long long last_blocks; }; }; }; struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct extent_info ei; /* extent info */ struct list_head list; /* node in global extent list of sbi */ struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { nid_t ino; /* inode number */ enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ struct extent_info largest; /* largest cached extent for EX_READ */ }; struct extent_tree_info { struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ struct list_head zombie_list; /* extent zombie tree list */ atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ }; /* * State of block returned by f2fs_map_blocks. 
*/ #define F2FS_MAP_NEW (1U << 0) #define F2FS_MAP_MAPPED (1U << 1) #define F2FS_MAP_DELALLOC (1U << 2) #define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ F2FS_MAP_DELALLOC) struct f2fs_map_blocks { struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; unsigned int m_flags; unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ enum { F2FS_GET_BLOCK_DEFAULT, F2FS_GET_BLOCK_FIEMAP, F2FS_GET_BLOCK_BMAP, F2FS_GET_BLOCK_DIO, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, F2FS_GET_BLOCK_PRECACHE, }; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 #define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) #define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) #define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) #define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) #define DEF_DIR_LEVEL 0 /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ FI_INLINE_DENTRY, /* used for inline dentry */ FI_APPEND_WRITE, /* inode has appended data */ FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ FI_DATA_EXIST, /* indicate data exists */ FI_SKIP_WRITES, /* should skip data page writeback */ 
FI_OPU_WRITE, /* used for opu per file */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_ATOMIC_DIRTIED, /* indicate atomic file is dirtied */ FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_OPENED_FILE, /* indicate file has been opened */ FI_DONATE_FINISHED, /* indicate page donation of file has been finished */ FI_MAX, /* max flag, never be used */ }; struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ union { unsigned int i_current_depth; /* only for directory depth */ unsigned short i_gc_failures; /* for gc failure statistic */ }; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ unsigned int ioprio_hint; /* hint for IO priority */ struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ /* linked in global inode list for cache donation */ struct list_head gdonate_list; pgoff_t donate_start, donate_end; /* inclusive */ atomic_t open_count; /* # of open files */ struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; /* cached extent_tree entry */ union { struct inode *cow_inode; /* copy-on-write inode for atomic write */ struct inode *atomic_inode; /* point to atomic_inode, available only for cow_inode */ }; /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation 
time */ struct timespec64 i_disk_time[3];/* inode disk times */ /* for file compress */ atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned char i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ atomic_t writeback; /* count # of writeback thread */ unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; /* filesystem verity info */ #endif }; static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); ext->blk = le32_to_cpu(i_ext->blk); ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); i_ext->blk = cpu_to_le32(ext->blk); i_ext->len = cpu_to_le32(ext->len); } static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { return (back->lstart + back->len == front->lstart) && (back->len + front->len <= max_len); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, struct discard_info *back, unsigned int max_len) { return __is_discard_mergeable(back, cur, max_len); } static inline bool __is_discard_front_mergeable(struct discard_info *cur, struct discard_info *front, unsigned int max_len) { return __is_discard_mergeable(cur, front, max_len); } /* * For free nid management */ enum nid_state { FREE_NID, /* newly added to free nid list */ PREALLOC_NID, /* it is preallocated */ MAX_NID_STATE, }; enum nat_state { TOTAL_NAT, DIRTY_NAT, RECLAIMABLE_NAT, MAX_NAT_STATE, }; struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ nid_t max_rf_node_blocks; /* max # of nodes for recovery */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char **free_nid_bitmap; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ unsigned int nat_bits_blocks; /* # of nat bits 
blocks */ unsigned char *nat_bits; /* NAT bits blocks */ unsigned char *full_nat_bits; /* full NAT pages */ unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif int bitmap_size; /* bitmap size */ }; /* * this structure is used as one of function parameters. * all the information are dedicated to a given direct node block determined * by the data offset in a file. */ struct dnode_of_data { struct inode *inode; /* vfs inode pointer */ struct folio *inode_folio; /* its inode folio, NULL is possible */ struct folio *node_folio; /* cached direct node folio */ nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_folio_locked; /* inode folio is locked or not */ bool node_changed; /* is node block changed */ char cur_level; /* level of hole node page */ char max_level; /* level of current page located */ block_t data_blkaddr; /* block address of the node block */ }; static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct folio *ifolio, struct folio *nfolio, nid_t nid) { memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_folio = ifolio; dn->node_folio = nfolio; dn->nid = nid; } /* * For SIT manager * * By default, there are 6 active log areas across the whole main area. * When considering hot and cold data separation to reduce cleaning overhead, * we split 3 for data logs and 3 for node logs as hot, warm, and cold types, * respectively. * In the current design, you should not change the numbers intentionally. * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6 * logs individually according to the underlying devices. (default: 6) * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for * data and 8 for node logs. 
*/ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) #define NR_CURSEG_INMEM_TYPE (2) #define NR_CURSEG_RO_TYPE (2) #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) enum log_type { CURSEG_HOT_DATA = 0, /* directory entry blocks */ CURSEG_WARM_DATA, /* data blocks */ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NR_PERSISTENT_LOG, /* number of persistent log */ CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, /* pinned file that needs consecutive block address */ CURSEG_ALL_DATA_ATGC, /* SSR allocator in hot/warm/cold data area */ NO_CHECK_TYPE, /* number of persistent & inmem log */ }; struct flush_cmd { struct completion wait; struct llist_node llnode; nid_t ino; int ret; }; struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ atomic_t issued_flush; /* # of issued flushes */ atomic_t queued_flush; /* # of queued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_seq_blocks; /* threshold for sequential blocks */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; /* for discard command control */ struct discard_cmd_control *dcc_info; }; /* * For superblock */ /* * COUNT_TYPE for monitoring * * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ #define WB_DATA_TYPE(folio, f) \ (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, F2FS_RD_DATA, F2FS_RD_NODE, F2FS_RD_META, F2FS_DIO_WRITE, F2FS_DIO_READ, NR_COUNT_TYPE, }; /* * The below are the page types of bios used in submit_bio(). * The available types are: * DATA User data pages. It operates in async mode. * NODE Node pages. It operates in async mode. * META FS metadata pages such as SIT, NAT, CP.
* NR_PAGE_TYPE The number of page types. * META_FLUSH Make sure the previous pages are written * while waiting for the bio's completion. * ... Can only be used with META. */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) #define PAGE_TYPE_ON_MAIN(type) ((type) == DATA || (type) == NODE) enum page_type { DATA = 0, NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, IPU, /* the below types are used by tracepoints only. */ OPU, }; enum temp_type { HOT = 0, /* must be zero for meta bio */ WARM, COLD, NR_TEMP_TYPE, }; enum need_lock_type { LOCK_REQ = 0, LOCK_DONE, LOCK_RETRY, }; enum cp_reason_type { CP_NO_NEEDED, CP_NON_REGULAR, CP_COMPRESSED, CP_HARDLINK, CP_SB_NEED_CP, CP_WRONG_PINO, CP_NO_SPC_ROLL, CP_NODE_NEED_CP, CP_FASTBOOT_MODE, CP_SPEC_LOG_NUM, CP_RECOVER_DIR, CP_XATTR_DIR, }; enum iostat_type { /* WRITE IO */ APP_DIRECT_IO, /* app direct write IOs */ APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from foreground gc */ FS_GC_NODE_IO, /* node IOs from foreground gc */ FS_CP_DATA_IO, /* data IOs from checkpoint */ FS_CP_NODE_IO, /* node IOs from checkpoint */ FS_CP_META_IO, /* meta IOs from checkpoint */ /* READ IO */ APP_DIRECT_READ_IO, /* app direct read IOs */ APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ FS_NODE_READ_IO, /* node read IOs */ FS_META_READ_IO, /* meta read IOs */ /* other */ FS_DISCARD_IO, /* discard */ FS_FLUSH_IO, /* flush */ FS_ZONE_RESET_IO, /* zone reset */ NR_IO_TYPE, }; struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ enum req_op op; /* contains REQ_OP_ */ blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before COW */ union { struct page *page; /* page to be written */ struct folio *folio; }; struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ unsigned int compr_blocks; /* # of compressed block addresses */ unsigned int need_lock:8; /* indicate we need to lock cp_rwsem */ unsigned int version:8; /* version of the node */ unsigned int submitted:1; /* indicate IO submission */ unsigned int in_list:1; /* indicate fio is in io_list */ unsigned int is_por:1; /* indicate IO is from recovery or not */ unsigned int encrypted:1; /* indicate file is encrypted */ unsigned int meta_gc:1; /* require meta inode GC */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ sector_t *last_block; /* last 
block number in bio */ }; struct bio_entry { struct bio *bio; struct list_head list; }; #define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ #ifdef CONFIG_BLK_DEV_ZONED struct completion zone_wait; /* condition value for the previous open zone to close */ struct bio *zone_pending_bio; /* pending bio for the previous zone */ void *bi_private; /* previous bi_private for pending bio */ #endif struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ struct list_head bio_list; /* bio entry list head */ struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { struct file *bdev_file; struct block_device *bdev; char path[MAX_PATH_LEN + 1]; unsigned int total_segments; block_t start_blk; block_t end_blk; #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_blkz; /* Total number of zones */ unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ #endif }; enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ DONATE_INODE, /* for all inode to donate pages */ NR_INODE_TYPE, }; /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ spinlock_t ino_lock; /* for ino entry lock */ struct list_head ino_list; /* inode list head */ unsigned long ino_num; /* number of entries */ }; /* for GC_AT */ struct atgc_management { bool atgc_enabled; /* ATGC is enabled or not */ struct rb_root_cached root; /* root of victim rb-tree */ struct list_head victim_list; /* linked with all victim entries */ unsigned int victim_count; /* victim count in rb-tree */ unsigned int candidate_ratio; /* candidate ratio */ unsigned int max_candidate_count; /* max candidate count */ unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ unsigned long long age_threshold; /* age threshold */ }; struct f2fs_gc_control { unsigned int victim_segno; /* target victim segment number */ int init_gc_type; /* FG_GC or BG_GC */ bool no_bg_gc; /* check the space and stop bg_gc */ bool should_migrate_blocks; /* should migrate blocks */ bool err_gc_skipped; /* return EAGAIN if GC skipped */ bool one_time; /* require one time GC in one migration unit */ unsigned int nr_free_secs; /* # of free sections to do GC */ }; /* * For s_flag in struct f2fs_sb_info * Modification on enum should be synchronized with s_flag array */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ SBI_IS_WRITABLE, /* remove ro mountoption 
transiently */ MAX_SBI_FLAG, }; enum { CP_TIME, REQ_TIME, DISCARD_TIME, GC_TIME, DISABLE_TIME, ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; /* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, GC_URGENT_MID, MAX_GC_MODE, }; enum { BGGC_MODE_ON, /* background gc is on */ BGGC_MODE_OFF, /* background gc is off */ BGGC_MODE_SYNC, /* * background gc is on, migrating blocks * like foreground gc */ }; enum { FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ FS_MODE_LFS, /* use lfs allocation only */ FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; enum fsync_mode { FSYNC_MODE_POSIX, /* fsync follows posix semantics */ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; enum { COMPR_MODE_FS, /* * automatically compress compression * enabled files */ COMPR_MODE_USER, /* * automatic compression is disabled. * the user can control the file compression * using ioctls */ }; enum { DISCARD_UNIT_BLOCK, /* basic discard unit is block */ DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ DISCARD_UNIT_SECTION, /* basic discard unit is section */ }; enum { MEMORY_MODE_NORMAL, /* memory mode for normal devices */ MEMORY_MODE_LOW, /* memory mode for low memory devices */ }; enum errors_option { MOUNT_ERRORS_READONLY, /* remount fs ro on errors */ MOUNT_ERRORS_CONTINUE, /* continue on errors */ MOUNT_ERRORS_PANIC, /* panic on errors */ }; enum { BACKGROUND, FOREGROUND, MAX_CALL_TYPE, TOTAL_CALL = FOREGROUND, }; enum f2fs_lookup_mode { LOOKUP_PERF, LOOKUP_COMPAT, LOOKUP_AUTO, }; static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); /* * Layout of f2fs page.private: * * Layout A: lowest bit should be 1 * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | * bit 0 PAGE_PRIVATE_NOT_POINTER * bit 1 PAGE_PRIVATE_ONGOING_MIGRATION * bit 2 PAGE_PRIVATE_INLINE_INODE * bit 3 PAGE_PRIVATE_REF_RESOURCE * bit 4 PAGE_PRIVATE_ATOMIC_WRITE * bit 5- f2fs private data * * Layout B: lowest bit should be 0 * page.private is a wrapped pointer. 
*/ enum { PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is being migrated */ PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ PAGE_PRIVATE_MAX }; /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, COMPRESS_LZ4, COMPRESS_ZSTD, COMPRESS_LZORLE, COMPRESS_MAX, }; enum compress_flag { COMPRESS_CHKSUM, COMPRESS_MAX_FLAG, }; #define COMPRESS_WATERMARK 20 #define COMPRESS_PERCENT 20 #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ __le32 chksum; /* compressed data checksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; #define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 #define F2FS_ZSTD_DEFAULT_CLEVEL 1 #define COMPRESS_LEVEL_OFFSET 8 /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belongs to */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ void *private; /* payload buffer for specified compression algorithm */ void *private2; /* extra payload buffer */ }; /* compress context for write IO path */ struct compress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belongs to */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ atomic_t pending_pages; /* in-flight compressed page count */ }; /* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belongs to */ struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ pgoff_t cluster_idx; /* cluster index number */ unsigned int cluster_size; /* page count in cluster */ unsigned int log_cluster_size; /* log of cluster size */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ struct page **tpages; /* temp pages to pad holes in cluster */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ /* * The number of compressed pages remaining to be read in this cluster. * This is initially nr_cpages. It is decremented by 1 each time a page * has been read (or failed to be read). When it reaches 0, the cluster * is decompressed (or an error is reported). 
* * If an error occurs before all the pages have been submitted for I/O, * then this will never reach 0. In this case the I/O submitter is * responsible for calling f2fs_decompress_end_io() instead. */ atomic_t remaining_pages; /* * Number of references to this decompress_io_ctx. * * One reference is held for I/O completion. This reference is dropped * after the pagecache pages are updated and unlocked -- either after * decompression (and verity if enabled), or after an error. * * In addition, each compressed page holds a reference while it is in a * bio. These references are necessary to prevent compressed pages from * being freed while they are still in a bio. */ refcount_t refcnt; bool failed; /* IO error occurred before decompression? */ bool need_verity; /* need fs-verity verification after decompression? */ unsigned char compress_algorithm; /* backup algorithm type */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ struct work_struct free_work; /* work to free this structure itself later */ }; #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 #define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ struct f2fs_rwsem sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int max_open_zones; /* max open zone resources of the zoned device */ /* For adjusting the priority write position of data on zoned UFS */ unsigned int blkzone_alloc_policy; #endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ struct f2fs_rwsem node_write; /* locking node writes */ struct f2fs_rwsem node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct cp_stats cp_stats; /* for time stat of checkpoint */ struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ spinlock_t fsync_node_lock; /* for node entry lock */ struct list_head fsync_node_list; /* node list head */ 
unsigned int fsync_seg_id; /* sequence id */ unsigned int fsync_node_num; /* number of node entries */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ /* for inode management */ struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; atomic64_t allocated_data_blocks; /* for block age extent_cache */ unsigned int max_read_extent_count; /* max read extent count per inode */ /* The threshold used for hot and warm data separation */ unsigned int hot_data_age_threshold; unsigned int warm_data_age_threshold; unsigned int last_age_weight; /* control donate caches */ unsigned int donate_files; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ unsigned int blocksize; /* block size */ unsigned int root_ino_num; /* root inode number */ unsigned int node_ino_num; /* node inode number */ unsigned int meta_ino_num; /* meta inode number */ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidates */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ /* Additional tracking for no checkpoint mode */ block_t unusable_block_count; /* # of blocks saved by last cp */ unsigned int nquota_files; /* # of quota sysfile */ struct f2fs_rwsem quota_sem; /* blocking cp for flags */ struct task_struct *umount_lock_holder; /* s_umount lock holder */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; /* # of node block writes as roll forward recovery */ struct percpu_counter rf_node_block_count; /* writeback control */ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ /* valid inode count */ struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ struct f2fs_rwsem gc_lock; /* * semaphore for GC, avoid * race between GC and GC or CP */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ spinlock_t gc_remaining_trials_lock; /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* free sections reserved for pinned file */ unsigned int reserved_pin_section; /* threshold for gc trials on pinned files */ unsigned short 
gc_pin_file_threshold; struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; /* migration granularity of garbage collection, unit: segment */ unsigned int migration_granularity; /* migration window granularity of garbage collection, unit: segment */ unsigned int migration_window_granularity; /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. */ #ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ atomic_t meta_count[META_MAX]; /* # of meta blocks */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ /* # of lookup extent cache */ atomic64_t total_hit_ext[NR_EXTENT_CACHES]; /* # of hit rbtree extent node */ atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; /* # of hit cached extent node */ atomic64_t read_hit_cached[NR_EXTENT_CACHES]; /* # of hit largest extent node in read extent cache */ atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t swapfile_inode; /* # of swapfile inodes */ atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ atomic_t cp_call_count[MAX_CALL_TYPE]; /* # of cp call */ #endif spinlock_t stat_lock; /* lock for stat operations */ /* to attach REQ_META|REQ_FUA flags */ unsigned int data_io_flag; unsigned int node_io_flag; /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ struct completion s_kobj_unregister; struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ struct completion s_stat_kobj_unregister; struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ struct completion s_feature_list_kobj_unregister; /* For shrinker support */ struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices have the same logical blksize */ unsigned int first_seq_zone_segno; /* first segno in sequential zone */ unsigned int bggc_io_aware; /* For adjusting the BG_GC priority when there is pending IO */ unsigned int allocate_section_hint; /* the boundary position between devices */ unsigned int allocate_section_policy; /* determine the section writing priority */ /* For write statistics */ u64 sectors_written_start; u64 kbytes_written; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; struct workqueue_struct *post_read_wq; /* post read workqueue */ /* * If we are in irq context, let's update error information into * on-disk superblock in the work. 
*/ struct work_struct s_error_work; unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ unsigned char stop_reason[MAX_STOP_REASON]; /* stop reason */ spinlock_t error_lock; /* protect errors/stop_reason array */ bool error_dirty; /* errors of sb is dirty */ /* For reclaimed segs statistics per each GC mode */ unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ int max_fragment_chunk; /* max chunk size for block fragmentation mode */ int max_fragment_hole; /* max hole size for block fragmentation mode */ /* For atomic write statistics */ atomic64_t current_atomic_write; s64 peak_atomic_write; u64 committed_atomic_block; u64 revoked_atomic_block; /* carve out reserved_blocks from total blocks */ bool carve_out; #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ /* For runtime compression statistics */ u64 compr_written_block; u64 compr_saved_block; u32 compr_new_inode; /* For compressed block cache */ struct inode *compress_inode; /* cache compressed blocks */ unsigned int compress_percent; /* cache page percentage */ unsigned int compress_watermark; /* cache page watermark */ atomic_t compress_page_hit; /* cache hit count */ #endif #ifdef CONFIG_F2FS_IOSTAT /* For app/fs IO statistics */ spinlock_t iostat_lock; unsigned long long iostat_count[NR_IO_TYPE]; unsigned long long iostat_bytes[NR_IO_TYPE]; unsigned long long prev_iostat_bytes[NR_IO_TYPE]; bool iostat_enable; unsigned long iostat_next_period; unsigned int iostat_period_ms; /* For io latency related statistics info in one iostat period */ spinlock_t iostat_lat_lock; struct iostat_lat_info *iostat_io_lat; #endif }; /* Definitions to access f2fs_sb_info */ #define SEGS_TO_BLKS(sbi, segs) \ ((segs) << (sbi)->log_blocks_per_seg) #define BLKS_TO_SEGS(sbi, blks) \ ((blks) >> (sbi)->log_blocks_per_seg) #define BLKS_PER_SEG(sbi) ((sbi)->blocks_per_seg) #define BLKS_PER_SEC(sbi) (SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec)) #define SEGS_PER_SEC(sbi) ((sbi)->segs_per_sec) __printf(3, 4) void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...); #define f2fs_err(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__) #define f2fs_warn(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__) #define f2fs_notice(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__) #define f2fs_info(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__) #define f2fs_debug(sbi, fmt, ...) \ f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__) #define f2fs_err_ratelimited(sbi, fmt, ...) \ f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__) #define f2fs_warn_ratelimited(sbi, fmt, ...) \ f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__) #define f2fs_info_ratelimited(sbi, fmt, ...) 
\ f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__) #ifdef CONFIG_F2FS_FAULT_INJECTION #define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ __builtin_return_address(0)) static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, const char *func, const char *parent_func) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (!ffi->inject_rate) return false; if (!IS_FAULT_SET(ffi, type)) return false; atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); ffi->inject_count[type]++; f2fs_info_ratelimited(sbi, "inject %s in %s of %pS", f2fs_fault_name[type], func, parent_func); return true; } return false; } #else static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; } #endif /* * Test if the mounted volume is a multi-device volume. * - For a single regular disk volume, sbi->s_ndevs is 0. * - For a single zoned disk volume, sbi->s_ndevs is 1. * - For a multi-device volume, sbi->s_ndevs is always 2 or more. */ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) { return sbi->s_ndevs > 1; } static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { unsigned long now = jiffies; sbi->last_time[type] = now; /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ if (type == REQ_TIME) { sbi->last_time[DISCARD_TIME] = now; sbi->last_time[GC_TIME] = now; } } static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, sbi->last_time[type] + interval); } static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, int type) { unsigned long interval = sbi->interval_time[type] * HZ; unsigned int wait_ms = 0; long delta; delta = (sbi->last_time[type] + interval) - jiffies; if (delta > 0) wait_ms = jiffies_to_msecs(delta); return wait_ms; } /* * Inline functions */ static inline u32 __f2fs_crc32(u32 crc, const void *address, unsigned int length) { return crc32(crc, address, length); } static inline u32 f2fs_crc32(const void *address, unsigned int length) { return __f2fs_crc32(F2FS_SUPER_MAGIC, address, length); } static inline u32 f2fs_chksum(u32 crc, const void *address, unsigned int length) { return __f2fs_crc32(crc, address, length); } static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); } static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) { return F2FS_SB(inode->i_sb); } static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) { return F2FS_I_SB(mapping->host); } static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio) { return F2FS_M_SB(folio->mapping); } static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); } static inline struct f2fs_super_block *F2FS_SUPER_BLOCK(struct folio *folio, pgoff_t index) { pgoff_t idx_in_folio = index % folio_nr_pages(folio); return (struct f2fs_super_block *) (page_address(folio_page(folio, idx_in_folio)) + F2FS_SUPER_OFFSET); } static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) { return (struct f2fs_checkpoint *)(sbi->ckpt); } static inline struct f2fs_node *F2FS_NODE(const struct folio *folio) { return (struct f2fs_node *)folio_address(folio); } static inline struct f2fs_inode 
*F2FS_INODE(const struct folio *folio) { return &((struct f2fs_node *)folio_address(folio))->i; } static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); } static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_sm_info *)(sbi->sm_info); } static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) { return (struct sit_info *)(SM_I(sbi)->sit_info); } static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) { return (struct free_segmap_info *)(SM_I(sbi)->free_info); } static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) { return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) { return sbi->meta_inode->i_mapping; } static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) { return sbi->node_inode->i_mapping; } static inline bool is_meta_folio(struct folio *folio) { return folio->mapping == META_MAPPING(F2FS_F_SB(folio)); } static inline bool is_node_folio(struct folio *folio) { return folio->mapping == NODE_MAPPING(F2FS_F_SB(folio)); } static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) { return le64_to_cpu(cp->checkpoint_ver); } static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) { if (type < F2FS_MAX_QUOTAS) return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); return 0; } static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); } static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); return ckpt_flags & f; } static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); } static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags; ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { unsigned long flags; spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags; ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { unsigned long flags; spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); spin_unlock_irqrestore(&sbi->cp_lock, flags); } #define init_f2fs_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_f2fs_rwsem((sem), #sem, &__key); \ } while (0) static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, const char *sem_name, struct lock_class_key *key) { __init_rwsem(&sem->internal_rwsem, sem_name, key); #ifdef CONFIG_F2FS_UNFAIR_RWSEM init_waitqueue_head(&sem->read_waiters); #endif } 
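/* * Illustrative usage sketch (not part of the original header): init_f2fs_rwsem() is meant to be invoked once per f2fs_rwsem instance, so that each call site gets its own static lock_class_key and lockdep can distinguish the different lock instances. Mount-time setup code is expected to initialize the checkpoint-related locks of struct f2fs_sb_info roughly like: init_f2fs_rwsem(&sbi->cp_global_sem); init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->node_write); (field names refer to struct f2fs_sb_info declared above). */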
static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) { return rwsem_is_locked(&sem->internal_rwsem); } static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) { return rwsem_is_contended(&sem->internal_rwsem); } static inline void f2fs_down_read(struct f2fs_rwsem *sem) { #ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); #else down_read(&sem->internal_rwsem); #endif } static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) { return down_read_trylock(&sem->internal_rwsem); } static inline void f2fs_up_read(struct f2fs_rwsem *sem) { up_read(&sem->internal_rwsem); } static inline void f2fs_down_write(struct f2fs_rwsem *sem) { down_write(&sem->internal_rwsem); } #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) { down_read_nested(&sem->internal_rwsem, subclass); } static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) { down_write_nested(&sem->internal_rwsem, subclass); } #else #define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) #define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) #endif static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) { return down_write_trylock(&sem->internal_rwsem); } static inline void f2fs_up_write(struct f2fs_rwsem *sem) { up_write(&sem->internal_rwsem); #ifdef CONFIG_F2FS_UNFAIR_RWSEM wake_up_all(&sem->read_waiters); #endif } static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; unsigned char *nat_bits; /* * In order to re-enable nat_bits we need to call fsck.f2fs by * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, * so let's rely on regular fsck or unclean shutdown. */ if (lock) spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); nat_bits = NM_I(sbi)->nat_bits; NM_I(sbi)->nat_bits = NULL; if (lock) spin_unlock_irqrestore(&sbi->cp_lock, flags); kvfree(nat_bits); } static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, struct cp_control *cpc) { bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { f2fs_down_read(&sbi->cp_rwsem); } static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { if (time_to_inject(sbi, FAULT_LOCK_OP)) return 0; return f2fs_down_read_trylock(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { f2fs_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { f2fs_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { f2fs_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) { int reason = CP_SYNC; if (test_opt(sbi, FASTBOOT)) reason = CP_FASTBOOT; if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) reason = CP_UMOUNT; return reason; } static inline bool __remain_node_summaries(int reason) { return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 
1 : 0; return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) { return ofs == XATTR_NODE_OFFSET; } static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { if (!inode) return true; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) return true; if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; if (cap && capable(CAP_SYS_RESOURCE)) return true; return false; } static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { block_t avail_user_block_count; avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; else avail_user_block_count = 0; } return avail_user_block_count; } static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count, bool partial) { long long diff = 0, release = 0; block_t avail_user_block_count; int ret; ret = dquot_reserve_block(inode, *count); if (ret) return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { release = *count; goto release_quota; } /* * let's increase this in prior to actual block count change in order * for f2fs_sync_file to avoid data races when deciding checkpoint. */ percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); avail_user_block_count = get_available_block_count(sbi, inode, true); diff = (long long)sbi->total_valid_block_count + *count - avail_user_block_count; if (unlikely(diff > 0)) { if (!partial) { spin_unlock(&sbi->stat_lock); release = *count; goto enospc; } if (diff > *count) diff = *count; *count -= diff; release = diff; if (!*count) { spin_unlock(&sbi->stat_lock); goto enospc; } } sbi->total_valid_block_count += (block_t)(*count); spin_unlock(&sbi->stat_lock); if (unlikely(release)) { percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); } f2fs_i_blocks_write(inode, *count, true, true); return 0; enospc: percpu_counter_sub(&sbi->alloc_valid_block_count, release); release_quota: dquot_release_reservation_block(inode, release); return -ENOSPC; } #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ static inline bool folio_test_f2fs_##name(const struct folio *folio) \ { \ unsigned long priv = (unsigned long)folio->private; \ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ (1UL << PAGE_PRIVATE_##flagname); \ return (priv & v) == v; \ } \ static inline bool page_private_##name(struct page *page) \ { \ return PagePrivate(page) && \ test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ } #define PAGE_PRIVATE_SET_FUNC(name, flagname) \ static inline void folio_set_f2fs_##name(struct folio *folio) \ { \ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ (1UL << PAGE_PRIVATE_##flagname); \ if (!folio->private) \ folio_attach_private(folio, (void *)v); \ else { \ v |= (unsigned long)folio->private; \ folio->private = (void *)v; \ } \ } \ static inline void set_page_private_##name(struct 
page *page) \ { \ if (!PagePrivate(page)) \ attach_page_private(page, (void *)0); \ set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ } #define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ static inline void folio_clear_f2fs_##name(struct folio *folio) \ { \ unsigned long v = (unsigned long)folio->private; \ \ v &= ~(1UL << PAGE_PRIVATE_##flagname); \ if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \ folio_detach_private(folio); \ else \ folio->private = (void *)v; \ } \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) \ detach_page_private(page); \ } PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); static inline unsigned long folio_get_f2fs_data(struct folio *folio) { unsigned long data = (unsigned long)folio->private; if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) return 0; return data >> PAGE_PRIVATE_MAX; } static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data) { data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX); if (!folio_test_private(folio)) folio_attach_private(folio, (void *)data); else folio->private = (void *)((unsigned long)folio->private | data); } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, block_t count) { blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; spin_lock(&sbi->stat_lock); if (unlikely(sbi->total_valid_block_count < count)) { f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", sbi->total_valid_block_count, inode->i_ino, count); sbi->total_valid_block_count = 0; set_sbi_flag(sbi, SBI_NEED_FSCK); } else { sbi->total_valid_block_count -= count; } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks = min(sbi->reserved_blocks, sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); if (unlikely(inode->i_blocks < sectors)) { f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", inode->i_ino, (unsigned long long)inode->i_blocks, (unsigned long long)sectors); set_sbi_flag(sbi, SBI_NEED_FSCK); return; } f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DENTS || count_type == F2FS_DIRTY_NODES || count_type == F2FS_DIRTY_META || count_type == F2FS_DIRTY_QDATA || count_type == F2FS_DIRTY_IMETA) set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); if (IS_NOQUOTA(inode)) inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) { if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); if (IS_NOQUOTA(inode)) dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void inc_atomic_write_cnt(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); u64 current_write; fi->atomic_write_cnt++; atomic64_inc(&sbi->current_atomic_write); current_write = atomic64_read(&sbi->current_atomic_write); if (current_write > sbi->peak_atomic_write) sbi->peak_atomic_write = current_write; } static inline void release_atomic_write_cnt(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); fi->atomic_write_cnt = 0; } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { return atomic_read(&sbi->nr_pages[count_type]); } static inline int get_dirty_pages(struct inode *inode) { return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1, BLKS_PER_SEC(sbi)); } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { return sbi->total_valid_block_count; } static inline block_t discard_blocks(struct f2fs_sb_info *sbi) { return sbi->discard_blks; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); /* return NAT or SIT bitmap */ if (flag == NAT_BITMAP) return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); else if (flag == SIT_BITMAP) return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); return 0; } static inline block_t __cp_payload(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); } static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { offset = (flag == SIT_BITMAP) ? le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; /* * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. */ return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; return tmp_ptr + offset; } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); if (sbi->cur_cp_pack == 2) start_addr += BLKS_PER_SEG(sbi); return start_addr; } static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) { block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); if (sbi->cur_cp_pack == 1) start_addr += BLKS_PER_SEG(sbi); return start_addr; } static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) { sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 
2 : 1; } static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count, avail_user_node_count; unsigned int avail_user_block_count; int err; if (is_inode) { if (inode) { err = dquot_alloc_inode(inode); if (err) return err; } } else { err = dquot_reserve_block(inode, 1); if (err) return err; } if (time_to_inject(sbi, FAULT_BLOCK)) goto enospc; spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; avail_user_block_count = get_available_block_count(sbi, inode, test_opt(sbi, RESERVE_NODE)); if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; if (test_opt(sbi, RESERVE_NODE) && !__allow_reserved_root(sbi, inode, true)) avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); if (inode) { if (is_inode) f2fs_mark_inode_dirty_sync(inode, true); else f2fs_i_blocks_write(inode, 1, true, true); } percpu_counter_inc(&sbi->alloc_valid_block_count); return 0; enospc: if (is_inode) { if (inode) dquot_free_inode(inode); } else { dquot_release_reservation_block(inode, 1); } return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); if (unlikely(!sbi->total_valid_block_count || !sbi->total_valid_node_count)) { f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", sbi->total_valid_block_count, sbi->total_valid_node_count); set_sbi_flag(sbi, SBI_NEED_FSCK); } else { sbi->total_valid_block_count--; sbi->total_valid_node_count--; } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); if (is_inode) { dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); return; } f2fs_i_blocks_write(inode, 1, false, true); } } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { percpu_counter_dec(&sbi->total_valid_inode_count); } static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, pgoff_t index, bool for_write) { struct folio *folio; unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { fgf_t fgf_flags; if (!for_write) fgf_flags = FGP_LOCK | FGP_ACCESSED; else fgf_flags = FGP_LOCK; folio = __filemap_get_folio(mapping, index, fgf_flags, 0); if (!IS_ERR(folio)) return folio; if 
(time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) return ERR_PTR(-ENOMEM); } if (!for_write) return filemap_grab_folio(mapping, index); flags = memalloc_nofs_save(); folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); memalloc_nofs_restore(flags); return folio; } static inline struct folio *f2fs_filemap_get_folio( struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return ERR_PTR(-ENOMEM); return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask); } static inline void f2fs_folio_put(struct folio *folio, bool unlock) { if (IS_ERR_OR_NULL(folio)) return; if (unlock) { f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); folio_unlock(folio); } folio_put(folio); } static inline void f2fs_put_page(struct page *page, bool unlock) { if (!page) return; f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) { if (dn->node_folio) f2fs_folio_put(dn->node_folio, true); if (dn->inode_folio && dn->node_folio != dn->inode_folio) f2fs_folio_put(dn->inode_folio, false); dn->node_folio = NULL; dn->inode_folio = NULL; } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, size_t size) { return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, gfp_t flags) { void *entry; entry = kmem_cache_alloc(cachep, flags); if (!entry) entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); return entry; } static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) { if (nofail) return f2fs_kmem_cache_alloc_nofail(cachep, flags); if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) return NULL; return kmem_cache_alloc(cachep, flags); } static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || get_pages(sbi, F2FS_DIO_WRITE)) return true; if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) return true; if (SM_I(sbi) && SM_I(sbi)->fcc_info && atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return true; return false; } static inline bool is_inflight_read_io(struct f2fs_sb_info *sbi) { return get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_DIO_READ); } static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { bool zoned_gc = (type == GC_TIME && F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_BLKZONED)); if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) return false; if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) return false; if (sbi->gc_mode == GC_URGENT_MID) return true; if (sbi->gc_mode == GC_URGENT_LOW && (type == DISCARD_TIME || type == GC_TIME)) return true; if (zoned_gc) return true; return f2fs_time_over(sbi, type); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { while (radix_tree_insert(root, index, item)) cond_resched(); } #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) static inline bool IS_INODE(const struct folio *folio) { struct f2fs_node *p = F2FS_NODE(folio); return RAW_IS_INODE(p); } static inline int offset_in_addr(struct f2fs_inode *i) 
{ return (i->i_inline & F2FS_EXTRA_ATTR) ? (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; } static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } static inline int f2fs_has_extra_attr(struct inode *inode); static inline unsigned int get_dnode_base(struct inode *inode, struct folio *node_folio) { if (!IS_INODE(node_folio)) return 0; return inode ? get_extra_isize(inode) : offset_in_addr(&F2FS_NODE(node_folio)->i); } static inline __le32 *get_dnode_addr(struct inode *inode, struct folio *node_folio) { return blkaddr_in_node(F2FS_NODE(node_folio)) + get_dnode_base(inode, node_folio); } static inline block_t data_blkaddr(struct inode *inode, struct folio *node_folio, unsigned int offset) { return le32_to_cpu(*(get_dnode_addr(inode, node_folio) + offset)); } static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) { return data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node); } static inline int f2fs_test_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); return mask & *addr; } static inline void f2fs_set_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); *addr |= mask; } static inline void f2fs_clear_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); *addr &= ~mask; } static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) { int mask; int ret; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr |= mask; return ret; } static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) { int mask; int ret; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr &= ~mask; return ret; } static inline void f2fs_change_bit(unsigned int nr, char *addr) { int mask; addr += (nr >> 3); mask = BIT(7 - (nr & 0x07)); *addr ^= mask; } /* * On-disk inode flags (f2fs_inode::i_flags) */ #define F2FS_COMPR_FL 0x00000004 /* Compress file */ #define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ #define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ #define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ #define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ #define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ #define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */ #define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ F2FS_CASEFOLD_FL)) /* Flags that are appropriate for non-directories/regular files. 
*/ #define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) #define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL) static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & F2FS_REG_FLMASK; else return flags & F2FS_OTHER_FLMASK; } static inline void __mark_inode_dirty_flag(struct inode *inode, int flag, bool set) { switch (flag) { case FI_INLINE_XATTR: case FI_INLINE_DATA: case FI_INLINE_DENTRY: case FI_NEW_INODE: if (set) return; fallthrough; case FI_DATA_EXIST: case FI_PIN_FILE: case FI_COMPRESS_RELEASED: f2fs_mark_inode_dirty_sync(inode, true); } } static inline void set_inode_flag(struct inode *inode, int flag) { set_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, true); } static inline int is_inode_flag_set(struct inode *inode, int flag) { return test_bit(flag, F2FS_I(inode)->flags); } static inline void clear_inode_flag(struct inode *inode, int flag) { clear_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, false); } static inline bool f2fs_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS); } static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) { if (inc) inc_nlink(inode); else drop_nlink(inode); f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); /* add = 1, claim = 1 should be dquot_reserve_block in pair */ if (add) { if (claim) dquot_claim_block(inode, diff); else dquot_alloc_block_nofail(inode, diff); } else { dquot_free_block(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } static inline bool f2fs_is_atomic_file(struct inode *inode); static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); if (i_size_read(inode) == i_size) return; i_size_write(inode, i_size); if (f2fs_is_atomic_file(inode)) return; f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_gc_failures_write(struct inode *inode, unsigned int count) { F2FS_I(inode)->i_gc_failures = count; f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) { struct f2fs_inode_info *fi = F2FS_I(inode); if (ri->i_inline & F2FS_INLINE_XATTR) set_bit(FI_INLINE_XATTR, fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) set_bit(FI_INLINE_DATA, fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) set_bit(FI_INLINE_DENTRY, 
fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) set_bit(FI_DATA_EXIST, fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) set_bit(FI_PIN_FILE, fi->flags); if (ri->i_inline & F2FS_COMPRESS_RELEASED) set_bit(FI_COMPRESS_RELEASED, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) ri->i_inline |= F2FS_EXTRA_ATTR; if (is_inode_flag_set(inode, FI_PIN_FILE)) ri->i_inline |= F2FS_PIN_FILE; if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) ri->i_inline |= F2FS_COMPRESS_RELEASED; } static inline int f2fs_has_extra_attr(struct inode *inode) { return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_XATTR); } static inline int f2fs_compressed_file(struct inode *inode) { return S_ISREG(inode->i_mode) && is_inode_flag_set(inode, FI_COMPRESSED_FILE); } static inline bool f2fs_need_compress_data(struct inode *inode) { int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; if (!f2fs_compressed_file(inode)) return false; if (compress_mode == COMPR_MODE_FS) return true; else if (compress_mode == COMPR_MODE_USER && is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) return true; return false; } static inline unsigned int addrs_per_page(struct inode *inode, bool is_inode) { unsigned int addrs = is_inode ? (CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode)) : DEF_ADDRS_PER_BLOCK; if (f2fs_compressed_file(inode)) return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); return addrs; } static inline void *inline_xattr_addr(struct inode *inode, const struct folio *folio) { struct f2fs_inode *ri = F2FS_INODE(folio); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)]); } static inline int inline_xattr_size(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) return get_inline_xattr_addrs(inode) * sizeof(__le32); return 0; } /* * Notice: check inline_data flag without inode page lock is unsafe. * It could change at any time by f2fs_convert_inline_folio(). 
*/ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_is_mmap_file(struct inode *inode) { return is_inode_flag_set(inode, FI_MMAP_FILE); } static inline bool f2fs_is_pinned_file(struct inode *inode) { return is_inode_flag_set(inode, FI_PIN_FILE); } static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_cow_file(struct inode *inode) { return is_inode_flag_set(inode, FI_COW_FILE); } static inline void *inline_data_addr(struct inode *inode, struct folio *folio) { __le32 *addr = get_dnode_addr(inode, folio); return (void *)(addr + DEF_INLINE_RESERVED_SIZE); } static inline int f2fs_has_inline_dentry(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline int is_file(struct inode *inode, int type) { return F2FS_I(inode)->i_advise & type; } static inline void set_file(struct inode *inode, int type) { if (is_file(inode, type)) return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { if (!is_file(inode, type)) return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } static inline bool f2fs_is_time_consistent(struct inode *inode) { struct timespec64 ts = inode_get_atime(inode); if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts)) return false; ts = inode_get_ctime(inode); if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) return false; ts = inode_get_mtime(inode); if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) return false; return true; } static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { bool ret; if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || file_keep_isize(inode) || i_size_read(inode) & ~PAGE_MASK) return false; if (!f2fs_is_time_consistent(inode)) return false; spin_lock(&F2FS_I(inode)->i_size_lock); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); spin_unlock(&F2FS_I(inode)->i_size_lock); return ret; } static inline bool f2fs_readonly(struct super_block *sb) { return sb_rdonly(sb); } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; return kmalloc(size, flags); } static inline void *f2fs_getname(struct f2fs_sb_info *sbi) { if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; return __getname(); } static inline void f2fs_putname(char *buf) { __putname(buf); } static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); } static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KVMALLOC)) return NULL; return kvmalloc(size, flags); } static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); } static inline void *f2fs_vmalloc(struct f2fs_sb_info *sbi, size_t size) { if (time_to_inject(sbi, 
FAULT_VMALLOC)) return NULL; return vmalloc(size); } static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } static inline int get_inline_xattr_addrs(struct inode *inode) { return F2FS_I(inode)->i_inline_xattr_size; } #define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) #define F2FS_MIN_EXTRA_ATTR_SIZE (sizeof(__le32)) #define F2FS_TOTAL_EXTRA_ATTR_SIZE \ (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ #define __is_large_section(sbi) (SEGS_PER_SEC(sbi) > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); static inline void verify_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.", blkaddr, type); } static inline bool __is_valid_data_blkaddr(block_t blkaddr) { if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || blkaddr == COMPRESS_ADDR) return false; return true; } /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, bool readonly, bool need_lock); int f2fs_precache_extents(struct inode *inode); int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); int f2fs_pin_file_control(struct inode *inode, bool inc); /* * inode.c */ void f2fs_set_inode_flags(struct inode *inode); bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_update_inode(struct inode *inode, struct folio *node_folio); void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); void f2fs_handle_failed_inode(struct inode *inode); /* * namei.c */ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry 
*f2fs_get_parent(struct dentry *child); int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode); /* * dir.c */ #if IS_ENABLED(CONFIG_UNICODE) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname); void f2fs_free_casefolded_name(struct f2fs_filename *fname); #else static inline int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { return 0; } static inline void f2fs_free_casefolded_name(struct f2fs_filename *fname) { } #endif /* CONFIG_UNICODE */ int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct f2fs_filename *fname); int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_filename *fname); void f2fs_free_filename(struct f2fs_filename *fname); struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, const struct f2fs_filename *fname, int *max_slots, bool use_hash); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct f2fs_filename *fname, struct folio *dfolio); void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, const struct f2fs_filename *fname, struct folio **res_folio); struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, const struct qstr *child, struct folio **res_folio); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f); ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct folio **folio); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct folio *folio, struct inode *inode); bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, const struct f2fs_filename *fname); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct fscrypt_str *name, f2fs_hash_t name_hash, unsigned int bit_pos); int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode); int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname); bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } /* * super.c */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_do_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); 
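/*
 * Illustrative sketch (added, not part of the original header): a typical
 * directory lookup flow built from the dir.c helpers declared above.  The
 * function name is hypothetical and error handling is abbreviated.
 */
static inline ino_t f2fs_example_lookup_ino(struct inode *dir,
					const struct qstr *name)
{
	struct f2fs_filename fname;
	struct f2fs_dir_entry *de;
	struct folio *folio;
	ino_t ino = 0;

	/* Prepare the (possibly casefolded/encrypted) lookup name. */
	if (f2fs_setup_filename(dir, name, 1, &fname))
		return 0;

	/* On a hit, @folio references the dentry block holding @de. */
	de = __f2fs_find_entry(dir, &fname, &folio);
	if (de) {
		ino = le32_to_cpu(de->ino);
		f2fs_folio_put(folio, false);
	}

	f2fs_free_filename(&fname);
	return ino;
}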
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ struct node_info; enum node_type; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); int f2fs_remove_inode_page(struct inode *inode); struct folio *f2fs_new_inode_folio(struct inode *inode); struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, enum node_type node_type); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_node_manager_caches(void); void f2fs_destroy_node_manager_caches(void); /* * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); int f2fs_commit_atomic_write(struct inode *inode); 
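/*
 * Illustrative sketch (added, not part of the original header): the common
 * pattern for resolving a file offset to its on-disk block address with the
 * dnode helpers above.  set_new_dnode() and LOOKUP_NODE are defined elsewhere
 * in this header; the function name is hypothetical and error handling is
 * abbreviated.
 */
static inline block_t f2fs_example_data_blkaddr(struct inode *inode,
					pgoff_t index)
{
	struct dnode_of_data dn;
	block_t blkaddr;

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	if (f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE))
		return NULL_ADDR;

	/* dn.ofs_in_node now indexes @index within the located dnode block. */
	blkaddr = f2fs_data_blkaddr(&dn);
	f2fs_put_dnode(&dn);

	return blkaddr;
}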
void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, unsigned int len); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi); int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno); void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, enum iostat_type io_type); void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_io_info *fio); int f2fs_inplace_write_data(struct f2fs_io_info *fio); void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr, bool from_gc); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type seg_type); int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt); void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, bool ordered, bool locked); #define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct 
inode *inode, block_t blkaddr, block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint); enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, unsigned int segno); static inline struct inode *fio_inode(struct f2fs_io_info *fio) { return fio->folio->mapping->host; } #define DEF_FRAGMENT_SIZE 4 #define MIN_FRAGMENT_SIZE 1 #define MAX_FRAGMENT_SIZE 512 static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; } /* * checkpoint.c */ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, unsigned int ra_blocks); long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_add_orphan_inode(struct inode *inode); void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, bool from_cp); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info 
*sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); bool f2fs_is_cp_guaranteed(const struct folio *folio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs); struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct folio *f2fs_get_new_data_folio(struct inode *inode, struct folio *ifolio, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int f2fs_encrypt_one_page(struct f2fs_io_info *fio); bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); int f2fs_write_single_data_page(struct folio *folio, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance); void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool f2fs_release_folio(struct folio *folio, gfp_t wait); bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct folio *folio); int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops 
f2fs_iomap_ops; /* * gc.c */ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_gc_range(struct f2fs_sb_info *sbi, unsigned int start_seg, unsigned int end_seg, bool dry_run, unsigned int dry_run_sections); int f2fs_resize_fs(struct file *filp, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); /* victim selection function for cleaning and SSR */ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode, unsigned long long age, bool one_time); /* * recovery.c */ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); int __init f2fs_create_recovery_cache(void); void f2fs_destroy_recovery_cache(void); /* * debug.c */ #ifdef CONFIG_F2FS_STAT_FS enum { DEVSTAT_INUSE, DEVSTAT_DIRTY, DEVSTAT_FULL, DEVSTAT_FREE, DEVSTAT_PREFREE, DEVSTAT_MAX, }; struct f2fs_dev_stats { unsigned int devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */ }; struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_cached[NR_EXTENT_CACHES]; unsigned long long hit_rbtree[NR_EXTENT_CACHES]; unsigned long long total_ext[NR_EXTENT_CACHES]; unsigned long long hit_total[NR_EXTENT_CACHES]; int ext_tree[NR_EXTENT_CACHES]; int zombie_tree[NR_EXTENT_CACHES]; int ext_node[NR_EXTENT_CACHES]; /* to count memory footprint */ unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; /* for block age extent cache */ unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, ndirty_all; unsigned int nquota_files, ndonate_files; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; int nr_wb_cp_data, nr_wb_data; int nr_rd_data, nr_rd_node, nr_rd_meta; int nr_dio_read, nr_dio_write; unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages, compress_pages; int compress_page_hit; int prefree_count, free_segs, free_secs; int cp_call_count[MAX_CALL_TYPE], cp_count; int gc_call_count[MAX_CALL_TYPE]; int gc_segs[2][2]; int gc_secs[2][2]; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; int blkoff[NR_CURSEG_TYPE]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; unsigned int dirty_seg[NR_CURSEG_TYPE]; unsigned int full_seg[NR_CURSEG_TYPE]; unsigned int valid_blks[NR_CURSEG_TYPE]; 
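/* Added note: meta_count[] buckets meta-area block accesses by region
 * (CP / SIT / NAT / SSA); see stat_inc_meta_count() below. */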
unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; unsigned long long base_mem, cache_mem, page_mem; struct f2fs_dev_stats *dev_stats; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) { return (struct f2fs_stat_info *)sbi->stat_info; } #define stat_inc_cp_call_count(sbi, foreground) \ atomic_inc(&sbi->cp_call_count[(foreground)]) #define stat_inc_cp_count(sbi) (F2FS_STAT(sbi)->cp_count++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) #define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) #define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ } while (0) #define stat_dec_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ } while (0) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ } while (0) #define stat_dec_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ } while (0) #define stat_inc_inline_dir(inode) \ do { \ if (f2fs_has_inline_dentry(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) #define stat_dec_inline_dir(inode) \ do { \ if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) #define stat_inc_compr_inode(inode) \ do { \ if (f2fs_compressed_file(inode)) \ (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_dec_compr_inode(inode) \ do { \ if (f2fs_compressed_file(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_add_compr_blocks(inode, blocks) \ (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_swapfile_inode(inode) \ (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_dec_swapfile_inode(inode) \ (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_inc_atomic_inode(inode) \ (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) #define stat_dec_atomic_inode(inode) \ (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ atomic_inc(&(sbi)->meta_count[META_CP]); \ else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ atomic_inc(&(sbi)->meta_count[META_SIT]); \ else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ atomic_inc(&(sbi)->meta_count[META_NAT]); \ else if (blkaddr < SM_I(sbi)->main_blkaddr) \ atomic_inc(&(sbi)->meta_count[META_SSA]); \ } while (0) #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = 
atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) #define stat_inc_gc_call_count(sbi, foreground) \ (F2FS_STAT(sbi)->gc_call_count[(foreground)]++) #define stat_inc_gc_sec_count(sbi, type, gc_type) \ (F2FS_STAT(sbi)->gc_secs[(type)][(gc_type)]++) #define stat_inc_gc_seg_count(sbi, type, gc_type) \ (F2FS_STAT(sbi)->gc_segs[(type)][(gc_type)]++) #define stat_inc_tot_blk_count(si, blks) \ ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else #define stat_inc_cp_call_count(sbi, foreground) do { } while (0) #define stat_inc_cp_count(sbi) do { } while (0) #define stat_io_skip_bggc_count(sbi) do { } while (0) #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) #define stat_inc_total_hit(sbi, type) do { } while (0) #define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) #define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) #define stat_dec_inline_inode(inode) do { } while (0) #define stat_inc_inline_dir(inode) do { } while (0) #define stat_dec_inline_dir(inode) do { } while (0) #define stat_inc_compr_inode(inode) do { } while (0) #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_swapfile_inode(inode) do { } while (0) #define stat_dec_swapfile_inode(inode) do { } while (0) #define stat_inc_atomic_inode(inode) do { } while (0) #define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) #define stat_inc_gc_call_count(sbi, foreground) do { } while (0) #define stat_inc_gc_sec_count(sbi, type, gc_type) do { } while (0) #define stat_inc_gc_seg_count(sbi, type, gc_type) do { } while (0) #define stat_inc_tot_blk_count(si, blks) do { } while (0) #define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } static 
inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; extern const struct address_space_operations f2fs_node_aops; extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; extern struct kmem_cache *f2fs_inode_entry_slab; /* * inline.c */ bool f2fs_may_inline_data(struct inode *inode); bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, u64 from); int f2fs_read_inline_data(struct inode *inode, struct folio *folio); int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct folio *folio); int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, const struct f2fs_filename *fname, struct folio **res_folio, bool use_hash); int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct folio *ifolio); int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode); bool f2fs_empty_inline_dir(struct inode *dir); int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr); int f2fs_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); /* * shrinker.c */ unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc); unsigned long f2fs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc); unsigned int f2fs_donate_files(void); void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); void f2fs_join_shrinker(struct f2fs_sb_info *sbi); void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, block_t *blkaddr); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t 
blkaddr, unsigned int len); unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); /* block age extent cache ops */ void f2fs_init_age_extent_tree(struct inode *inode); bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_age_extent_cache(struct dnode_of_data *dn); void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, unsigned int len); unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); /* * sysfs.c */ #define MIN_RA_MUL 2 #define MAX_RA_MUL 256 int __init f2fs_init_sysfs(void); void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations f2fs_verityops; /* * crypto support */ static inline bool f2fs_encrypted_file(struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_FS_ENCRYPTION file_set_encrypt(inode); f2fs_set_inode_flags(inode); #endif } /* * Returns true if the reads of the inode's data need to undergo some * postprocessing step, like decryption or authenticity verification. */ static inline bool f2fs_post_read_required(struct inode *inode) { return f2fs_encrypted_file(inode) || fsverity_active(inode) || f2fs_compressed_file(inode); } static inline bool f2fs_used_in_atomic_write(struct inode *inode) { return f2fs_is_atomic_file(inode) || f2fs_is_cow_file(inode); } static inline bool f2fs_meta_inode_gc_required(struct inode *inode) { return f2fs_post_read_required(inode) || f2fs_used_in_atomic_write(inode); } /* * compress.c */ #ifdef CONFIG_F2FS_FS_COMPRESSION enum cluster_check_type { CLUSTER_IS_COMPR, /* check only if compressed cluster */ CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */ CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ }; bool f2fs_is_compressed_page(struct folio *folio); struct folio *f2fs_compress_control_folio(struct folio *folio); int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio); bool f2fs_is_compress_backend_ready(struct inode *inode); bool f2fs_is_compress_level_valid(int alg, int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio); int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index); void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, 
block_t blkaddr, unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, struct readahead_control *rac, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task); void f2fs_put_folio_dic(struct folio *folio, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, unsigned int ofs_in_node); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len); bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr); void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); #define inc_compr_inode_stat(inode) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ sbi->compr_new_inode++; \ } while (0) #define add_compr_block_stat(inode, blocks) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ int diff = F2FS_I(inode)->i_cluster_size - blocks; \ sbi->compr_written_block += blocks; \ sbi->compr_saved_block += diff; \ } while (0) #else static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) return true; /* not support compression */ return false; } static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; } static inline struct folio *f2fs_compress_control_folio(struct folio *folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } static inline void f2fs_end_read_compressed_page(struct folio *folio, bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); } static inline void f2fs_put_folio_dic(struct folio *folio, bool in_task) { WARN_ON_ONCE(1); } static inline unsigned int f2fs_cluster_blocks_are_contiguous( struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; } static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { } static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t 
blkaddr) { return false; } static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) static inline int f2fs_is_compressed_cluster( struct inode *inode, pgoff_t index) { return 0; } static inline bool f2fs_is_sparse_cluster( struct inode *inode, pgoff_t index) { return true; } static inline void f2fs_update_read_extent_tree_range_compressed( struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) { #ifdef CONFIG_F2FS_FS_COMPRESSION struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); fi->i_compress_algorithm = F2FS_OPTION(sbi).compress_algorithm; fi->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; fi->i_compress_flag = F2FS_OPTION(sbi).compress_chksum ? BIT(COMPRESS_CHKSUM) : 0; fi->i_cluster_size = BIT(fi->i_log_cluster_size); if ((fi->i_compress_algorithm == COMPRESS_LZ4 || fi->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) fi->i_compress_level = F2FS_OPTION(sbi).compress_level; fi->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); return 0; #else return -EOPNOTSUPP; #endif } static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); f2fs_down_write(&fi->i_sem); if (!f2fs_compressed_file(inode)) { f2fs_up_write(&fi->i_sem); return true; } if (f2fs_is_mmap_file(inode) || atomic_read(&fi->writeback) || (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) { f2fs_up_write(&fi->i_sem); return false; } fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); f2fs_up_write(&fi->i_sem); return true; } #define F2FS_FEATURE_FUNCS(name, flagname) \ static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); F2FS_FEATURE_FUNCS(packed_ssa, PACKED_SSA); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi, unsigned int zone) { return test_bit(zone, FDEV(devi).blkz_seq); } static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) { return f2fs_zone_is_seq(sbi, devi, blkaddr / sbi->blocks_per_blkz); } #endif static inline int f2fs_bdev_index(struct f2fs_sb_info *sbi, struct block_device *bdev) { int i; if (!f2fs_is_multi_device(sbi)) return 0; for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).bdev == bdev) return i; WARN_ON(1); return -1; } static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { return f2fs_sb_has_blkzoned(sbi); } static inline bool f2fs_bdev_support_discard(struct 
block_device *bdev) { return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) { int i; if (!f2fs_is_multi_device(sbi)) return f2fs_bdev_support_discard(sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) if (f2fs_bdev_support_discard(FDEV(i).bdev)) return true; return false; } static inline unsigned int f2fs_hw_discard_granularity(struct f2fs_sb_info *sbi) { int i = 1; unsigned int discard_granularity = bdev_discard_granularity(sbi->sb->s_bdev); if (f2fs_is_multi_device(sbi)) for (; i < sbi->s_ndevs && !bdev_is_zoned(FDEV(i).bdev); i++) discard_granularity = max_t(unsigned int, discard_granularity, bdev_discard_granularity(FDEV(i).bdev)); return discard_granularity; } static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) { return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || f2fs_hw_should_discard(sbi); } static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) { int i; if (!f2fs_is_multi_device(sbi)) return bdev_read_only(sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) if (bdev_read_only(FDEV(i).bdev)) return true; return false; } static inline bool f2fs_dev_is_readonly(struct f2fs_sb_info *sbi) { return f2fs_sb_has_readonly(sbi) || f2fs_hw_is_readonly(sbi); } static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } static inline bool f2fs_is_sequential_zone_area(struct f2fs_sb_info *sbi, block_t blkaddr) { if (f2fs_sb_has_blkzoned(sbi)) { #ifdef CONFIG_BLK_DEV_ZONED int devi = f2fs_target_device_index(sbi, blkaddr); if (!bdev_is_zoned(FDEV(devi).bdev)) return false; if (f2fs_is_multi_device(sbi)) { if (blkaddr < FDEV(devi).start_blk || blkaddr > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkaddr); return false; } blkaddr -= FDEV(devi).start_blk; } return f2fs_blkz_is_seq(sbi, devi, blkaddr); #else return false; #endif } return false; } static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; } static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) || f2fs_is_mmap_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { struct f2fs_inode_info *fi = F2FS_I(inode); int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) return; if (add) { atomic_add(diff, &fi->i_compr_blocks); stat_add_compr_blocks(inode, diff); } else { atomic_sub(diff, &fi->i_compr_blocks); stat_sub_compr_blocks(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); } static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, int flag) { if (!f2fs_is_multi_device(sbi)) return false; if (flag != F2FS_GET_BLOCK_DIO) return false; return sbi->aligned_blksize; } static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } #ifdef CONFIG_F2FS_FAULT_INJECTION extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, enum fault_option fo); #else static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, enum fault_option fo) { return 0; } #endif static 
inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA if (f2fs_sb_has_quota_ino(sbi)) return true; if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) return true; #endif return false; } static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } static inline void __f2fs_schedule_timeout(long timeout, bool io) { set_current_state(TASK_UNINTERRUPTIBLE); if (io) io_schedule_timeout(timeout); else schedule_timeout(timeout); } #define f2fs_io_schedule_timeout(timeout) \ __f2fs_schedule_timeout(timeout, true) #define f2fs_schedule_timeout(timeout) \ __f2fs_schedule_timeout(timeout, false) static inline void f2fs_io_schedule_timeout_killable(long timeout) { while (timeout) { if (fatal_signal_pending(current)) return; set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); if (timeout <= DEFAULT_SCHEDULE_TIMEOUT) return; timeout -= DEFAULT_SCHEDULE_TIMEOUT; } } static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, struct folio *folio, enum page_type type) { pgoff_t ofs = folio->index; if (unlikely(f2fs_cp_error(sbi))) return; if (ofs == sbi->page_eio_ofs[type]) { if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) set_ckpt_flags(sbi, CP_ERROR_FLAG); } else { sbi->page_eio_ofs[type] = ofs; sbi->page_eio_cnt[type] = 0; } } static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) { return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); } static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int cnt) { bool need_submit = false; int i = 0; do { struct folio *folio; folio = filemap_get_folio(META_MAPPING(sbi), blkaddr + i); if (!IS_ERR(folio)) { if (folio_test_writeback(folio)) need_submit = true; f2fs_folio_put(folio, false); } } while (++i < cnt && !need_submit); if (need_submit) f2fs_submit_merged_write_cond(sbi, sbi->meta_inode, NULL, 0, DATA); truncate_inode_pages_range(META_MAPPING(sbi), F2FS_BLK_TO_BYTES((loff_t)blkaddr), F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1))); } static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len) { f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); } #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _LINUX_F2FS_H */ |
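The compression statistics kept by add_compr_block_stat() and f2fs_i_compr_blocks_update() above come down to one subtraction: a cluster of BIT(i_log_cluster_size) blocks that occupies only some of them after compression saves the difference (i_cluster_size - blocks). A minimal user-space sketch of that bookkeeping, using made-up demo values rather than real mount options or on-disk state:

#include <stdio.h>

/*
 * Stand-alone illustration of the arithmetic behind add_compr_block_stat()
 * and f2fs_i_compr_blocks_update(). All numbers are demo values.
 */
int main(void)
{
	unsigned int log_cluster_size = 2;                  /* stand-in for compress_log_size */
	unsigned int cluster_size = 1U << log_cluster_size; /* BIT(log_cluster_size) == 4 */
	unsigned int written_blocks = 1;                    /* blocks the cluster occupies on disk */
	unsigned int saved_blocks = cluster_size - written_blocks;

	unsigned long long compr_written_block = 0;         /* mirrors sbi->compr_written_block */
	unsigned long long compr_saved_block = 0;           /* mirrors sbi->compr_saved_block */

	compr_written_block += written_blocks;
	compr_saved_block += saved_blocks;

	printf("cluster=%u written=%u saved=%u (totals: written=%llu saved=%llu)\n",
	       cluster_size, written_blocks, saved_blocks,
	       compr_written_block, compr_saved_block);
	return 0;
}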
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_POLL_H #define _LINUX_POLL_H #include <linux/compiler.h> #include <linux/ktime.h> #include <linux/wait.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <uapi/linux/poll.h> #include <uapi/linux/eventpoll.h> /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #define MAX_STACK_ALLOC 832 #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC #define WQUEUES_STACK_ALLOC (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC) #define N_INLINE_POLL_ENTRIES (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry)) #define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) struct poll_table_struct; /* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* * Do not touch the structure directly, use the access function * poll_requested_events() instead. */ typedef struct poll_table_struct { poll_queue_proc _qproc; __poll_t _key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). * See the comment above prepare_to_wait(), we need to * ensure that subsequent tests in this thread can't be * reordered with __add_wait_queue() in _qproc() paths. */ smp_mb(); } } /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has * to be started implicitly on poll(). You typically only want to do that * if the application is actually polling for POLLIN and/or POLLOUT. */ static inline __poll_t poll_requested_events(const poll_table *p) { return p ?
p->_key : ~(__poll_t)0; } static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->_qproc = qproc; pt->_key = ~(__poll_t)0; /* all events enabled */ } static inline bool file_can_poll(struct file *file) { return file->f_op->poll; } static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { if (unlikely(!file->f_op->poll)) return DEFAULT_POLLMASK; return file->f_op->poll(file, pt); } struct poll_table_entry { struct file *filp; __poll_t key; wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; /* * Structures and helpers for select/poll syscall */ struct poll_wqueues { poll_table pt; struct poll_table_page *table; struct task_struct *polling_task; int triggered; int error; int inline_index; struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; }; extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern u64 select_estimate_accuracy(struct timespec64 *tv); #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time); extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec); #define __MAP(v, from, to) \ (from < to ? (v & from) * (to/from) : (v & from) / (from/to)) static inline __u16 mangle_poll(__poll_t val) { __u16 v = (__force __u16)val; #define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X) return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) | M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) | M(HUP) | M(RDHUP) | M(MSG); #undef M } static inline __poll_t demangle_poll(u16 val) { #define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X) return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) | M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) | M(HUP) | M(RDHUP) | M(MSG); #undef M } #undef __MAP #endif /* _LINUX_POLL_H */ |
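As a minimal sketch of how the helpers above are meant to be used by a driver, here is a hypothetical character-device ->poll implementation; struct my_dev, its wq wait queue and its readable flag are invented for the example, while poll_wait(), poll_requested_events() and the EPOLL* masks come from this header and its includes.

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/wait.h>

/* Hypothetical per-device state, only for this sketch. */
struct my_dev {
	wait_queue_head_t wq;
	bool readable;
};

static __poll_t my_dev_poll(struct file *file, poll_table *wait)
{
	struct my_dev *dev = file->private_data;
	__poll_t mask = 0;

	/* Register on the wait queue first so a later wakeup is not missed. */
	poll_wait(file, &dev->wq, wait);

	/*
	 * A driver that starts work lazily (e.g. a DMA transfer) could test
	 * poll_requested_events(wait) & (EPOLLIN | EPOLLRDNORM) here, as the
	 * comment above poll_requested_events() suggests.
	 */
	if (dev->readable)
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

Wired into the .poll member of a file_operations structure, this handler is what vfs_poll() ends up calling for the device.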
// SPDX-License-Identifier:
GPL-2.0-or-later /* * net/sched/sch_generic.c Generic packet scheduler routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 * - Ingress support */ #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <linux/bpf.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/hotdata.h> #include <trace/events/qdisc.h> #include <trace/events/net.h> #include <net/xfrm.h> /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; EXPORT_SYMBOL(default_qdisc_ops); static void qdisc_maybe_clear_missed(struct Qdisc *q, const struct netdev_queue *txq) { clear_bit(__QDISC_STATE_MISSED, &q->state); /* Make sure the below netif_xmit_frozen_or_stopped() * checking happens after clearing STATE_MISSED. */ smp_mb__after_atomic(); /* Checking netif_xmit_frozen_or_stopped() again to * make sure STATE_MISSED is set if the STATE_MISSED * set by netif_tx_wake_queue()'s rescheduling of * net_tx_action() is cleared by the above clear_bit(). */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); else set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with * qdisc_lock(qdisc) spinlock. * * The idea is the following: * - enqueue, dequeue are serialized via qdisc root lock * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. 
*/ #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL) static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) { const struct netdev_queue *txq = q->dev_queue; spinlock_t *lock = NULL; struct sk_buff *skb; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->skb_bad_txq); if (skb) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = SKB_XOFF_MAGIC; qdisc_maybe_clear_missed(q, txq); } } if (lock) spin_unlock(lock); return skb; } static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) { struct sk_buff *skb = skb_peek(&q->skb_bad_txq); if (unlikely(skb)) skb = __skb_dequeue_bad_txq(q); return skb; } static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, struct sk_buff *skb) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } __skb_queue_tail(&q->skb_bad_txq, skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } if (lock) spin_unlock(lock); } static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); /* it's still part of the queue */ if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_requeues_inc(q); qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } skb = next; } if (lock) { spin_unlock(lock); set_bit(__QDISC_STATE_MISSED, &q->state); } else { __netif_schedule(q); } } static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, int *packets, int budget) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; int cnt = 0; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); if (!nskb) break; bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; if (++cnt >= budget) break; } (*packets) += cnt; skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure * all skbs in the chain are for the same txq */ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, struct sk_buff *skb, int *packets) { int mapping = skb_get_queue_mapping(skb); struct sk_buff *nskb; int cnt = 0; do { nskb = q->dequeue(q); if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { qdisc_enqueue_skb_bad_txq(q, nskb); break; } skb->next = nskb; skb = nskb; } while (++cnt < 8); (*packets) += cnt; skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets, int budget) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; *packets = 1; if (unlikely(!skb_queue_empty(&q->gso_skb))) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->gso_skb); /* skb may be null if another cpu pulls gso_skb off in between * empty check and lock. 
*/ if (!skb) { if (lock) spin_unlock(lock); goto validate; } /* skb in gso_skb were already validated */ *validate = false; if (xfrm_offload(skb)) *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = NULL; qdisc_maybe_clear_missed(q, txq); } if (lock) spin_unlock(lock); goto trace; } validate: *validate = true; if ((q->flags & TCQ_F_ONETXQUEUE) && netif_xmit_frozen_or_stopped(txq)) { qdisc_maybe_clear_missed(q, txq); return skb; } skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) { if (skb == SKB_XOFF_MAGIC) return NULL; goto bulk; } skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) try_bulk_dequeue_skb(q, skb, txq, packets, budget); else try_bulk_dequeue_skb_slow(q, skb, packets); } trace: trace_qdisc_dequeue(q, txq, *packets, skb); return skb; } /* * Transmit possibly several skbs, and handle the return status as * required. Owning qdisc running bit guarantees that only one CPU * can execute this function. * * Returns to the caller: * false - hardware queue frozen backoff * true - feel free to send more pkts */ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; bool again = false; /* And release qdisc */ if (root_lock) spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) skb = validate_xmit_skb_list(skb, dev, &again); #ifdef CONFIG_XFRM_OFFLOAD if (unlikely(again)) { if (root_lock) spin_lock(root_lock); dev_requeue_skb(skb, q); return false; } #endif if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); else qdisc_maybe_clear_missed(q, txq); HARD_TX_UNLOCK(dev, txq); } else { if (root_lock) spin_lock(root_lock); return true; } if (root_lock) spin_lock(root_lock); if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); dev_requeue_skb(skb, q); return false; } return true; } /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * * netif_tx_lock serializes accesses to device driver. * * qdisc_lock(q) and netif_tx_lock are mutually exclusive, * if one is grabbed, another must be free. * * Note, that this procedure can be called by a watchdog timer * * Returns to the caller: * 0 - queue is empty or throttled. * >0 - queue is not empty. 
* */ static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets, budget); if (unlikely(!skb)) return false; if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) { int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets, quota)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) set_bit(__QDISC_STATE_MISSED, &q->state); else __netif_schedule(q); break; } } } unsigned long dev_trans_start(struct net_device *dev) { unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); unsigned long val; unsigned int i; for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) res = val; } return res; } EXPORT_SYMBOL(dev_trans_start); static void netif_freeze_queues(struct net_device *dev) { unsigned int i; int cpu; cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* We are the only thread of execution doing a * freeze, but we have to grab the _xmit_lock in * order to synchronize with threads which are in * the ->hard_start_xmit() handler and already * checked the frozen bit. */ __netif_tx_lock(txq, cpu); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } } void netif_tx_lock(struct net_device *dev) { spin_lock(&dev->tx_global_lock); netif_freeze_queues(dev); } EXPORT_SYMBOL(netif_tx_lock); static void netif_unfreeze_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* No need to grab the _xmit_lock here. If the * queue is not stopped for another reason, we * force a schedule. */ clear_bit(__QUEUE_STATE_FROZEN, &txq->state); netif_schedule_queue(txq); } } void netif_tx_unlock(struct net_device *dev) { netif_unfreeze_queues(dev); spin_unlock(&dev->tx_global_lock); } EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { struct net_device *dev = timer_container_of(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); if (!netif_xmit_stopped(txq)) continue; /* Paired with WRITE_ONCE() + smp_mb...() in * netdev_tx_sent_queue() and netif_tx_stop_queue(). 
*/ smp_mb(); trans_start = READ_ONCE(txq->trans_start); if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } if (time_after(oldest_start, trans_start)) oldest_start = trans_start; } if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", raw_smp_processor_id(), i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } } spin_unlock(&dev->tx_global_lock); if (release) netdev_put(dev, &dev->watchdog_dev_tracker); } void netdev_watchdog_up(struct net_device *dev) { if (!dev->netdev_ops->ndo_tx_timeout) return; if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) netdev_hold(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(netdev_watchdog_up); static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (timer_delete(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } /** * netif_carrier_on - set carrier * @dev: network device * * Device has detected acquisition of carrier. */ void netif_carrier_on(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); /** * netif_carrier_off - clear carrier * @dev: network device * * Device has detected loss of carrier. */ void netif_carrier_off(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } EXPORT_SYMBOL(netif_carrier_off); /** * netif_carrier_event - report carrier state event * @dev: network device * * Device has detected a carrier event but the carrier state wasn't changed. * Use in drivers when querying carrier state asynchronously, to avoid missing * events (link flaps) if link recovers before it's queried. */ void netif_carrier_event(struct net_device *dev) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } EXPORT_SYMBOL_GPL(netif_carrier_event); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. 
*/ static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; static struct netdev_queue noop_netdev_queue = { RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc), }; struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), }, .skb_bad_txq = { .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, }; EXPORT_SYMBOL(noop_qdisc); static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { /* register_qdisc() assigns a default of noop_enqueue if unset, * but __dev_queue_xmit() treats noqueue only as such * if this is NULL - so clear it here. */ qdisc->enqueue = NULL; return 0; } struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; EXPORT_SYMBOL(sch_default_prio2band); /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. */ #define PFIFO_FAST_BANDS 3 /* * Private data for a pfifo_fast scheduler containing: * - rings for priority bands */ struct pfifo_fast_priv { struct skb_array q[PFIFO_FAST_BANDS]; }; static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, int band) { return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct skb_array *q = band2list(priv, band); unsigned int pkt_len = qdisc_pkt_len(skb); int err; err = skb_array_produce(q, skb); if (unlikely(err)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else return qdisc_drop(skb, qdisc, to_free); } qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; bool need_retry = true; int band; retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); if (__skb_array_empty(q)) continue; skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). 
*/ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. */ smp_mb__after_atomic(); need_retry = false; goto retry; } return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; int band; for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); skb = __skb_array_peek(q); } return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (band = 0; band < PFIFO_FAST_BANDS; band++) { struct skb_array *q = band2list(priv, band); struct sk_buff *skb; /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; while ((skb = __skb_array_consume(q)) != NULL) kfree_skb(skb); } if (qdisc_is_percpu_stats(qdisc)) { for_each_possible_cpu(i) { struct gnet_stats_queue *q; q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; q->qlen = 0; } } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int prio; /* guard against zero length rings */ if (!qlen) return -EINVAL; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); int err; err = skb_array_init(q, qlen, GFP_KERNEL); if (err) return -ENOMEM; } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } static void pfifo_fast_destroy(struct Qdisc *sch) { struct pfifo_fast_priv *priv = qdisc_priv(sch); int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; /* Destroy ring but no need to kfree_skb because a call to * pfifo_fast_reset() has already done that work. 
*/ ptr_ring_cleanup(&q->ring, NULL); } } static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, unsigned int new_len) { struct pfifo_fast_priv *priv = qdisc_priv(sch); struct skb_array *bands[PFIFO_FAST_BANDS]; int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); bands[prio] = q; } return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .change_tx_queue_len = pfifo_fast_change_tx_queue_len, .owner = THIS_MODULE, .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) { struct Qdisc *sch; unsigned int size = sizeof(*sch) + ops->priv_size; int err = -ENOBUFS; struct net_device *dev; if (!dev_queue) { NL_SET_ERR_MSG(extack, "No device queue given"); err = -EINVAL; goto errout; } dev = dev_queue->dev; sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); if (!sch) goto errout; __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!sch->cpu_qstats) { free_percpu(sch->cpu_bstats); goto errout1; } } /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, unsigned int parentid, struct netlink_ext_ack *extack) { struct Qdisc *sch; if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL, extack) == 0) { trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; } qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); /* Under qdisc_lock(qdisc) and BH! 
*/ void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; trace_qdisc_reset(qdisc); if (ops->reset) ops->reset(qdisc); __skb_queue_purge(&qdisc->gso_skb); __skb_queue_purge(&qdisc->skb_bad_txq); qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); } kfree(qdisc); } static void qdisc_free_cb(struct rcu_head *head) { struct Qdisc *q = container_of(head, struct Qdisc, rcu); qdisc_free(q); } static void __qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct net_device *dev = qdisc_dev(qdisc); #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); qdisc_reset(qdisc); if (ops->destroy) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); call_rcu(&qdisc->rcu, qdisc_free_cb); } void qdisc_destroy(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; __qdisc_destroy(qdisc); } void qdisc_put(struct Qdisc *qdisc) { if (!qdisc) return; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); } EXPORT_SYMBOL(qdisc_put); /* Version of qdisc_put() that is called with rtnl mutex unlocked. * Intended to be used as optimization, this function only takes rtnl lock if * qdisc reference counter reached zero. */ void qdisc_put_unlocked(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); rtnl_unlock(); } EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc) { struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping); spinlock_t *root_lock; root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); /* ... 
and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); spin_unlock_bh(root_lock); return oqdisc; } EXPORT_SYMBOL(dev_graft_qdisc); static void shutdown_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { rcu_assign_pointer(dev_queue->qdisc, qdisc_default); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default); qdisc_put(qdisc); } } static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; const struct Qdisc_ops *ops = default_qdisc_ops; if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; else if(dev->type == ARPHRD_CAN) ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) return; if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } static void attach_default_qdiscs(struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED if (qdisc != &noop_qdisc) qdisc_hash_add(qdisc, false); #endif } static void transition_one_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_need_watchdog) { struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); int *need_watchdog_p = _need_watchdog; if (!(new_qdisc->flags & TCQ_F_BUILTIN)) clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } void dev_activate(struct net_device *dev) { int need_watchdog; /* No queueing discipline is attached to device; * create default one for devices, which need queueing * and noqueue_qdisc for virtual interfaces */ if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); if (dev_ingress_queue(dev)) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { netif_trans_update(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); static void qdisc_deactivate(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; 
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); } static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_sync_needed) { bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { if (qdisc->enqueue) *sync_needed = true; qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } static void dev_reset_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; bool nolock; qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); if (!qdisc) return; nolock = qdisc->flags & TCQ_F_NOLOCK; if (nolock) spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; spinlock_t *root_lock; struct Qdisc *q; int val; dev_queue = netdev_get_tx_queue(dev, i); q = rtnl_dereference(dev_queue->qdisc_sleeping); root_lock = qdisc_lock(q); spin_lock_bh(root_lock); val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); if (val) return true; } return false; } /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. */ void dev_deactivate_many(struct list_head *head) { bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), &sync_needed); netdev_watchdog_down(dev); } /* Wait for outstanding qdisc enqueuing calls. */ if (sync_needed) synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); if (dev_ingress_queue(dev)) dev_reset_queue(dev, dev_ingress_queue(dev), NULL); } /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) { /* wait_event() would avoid this sleep-loop but would * require expensive checks in the fast paths of packet * processing which isn't worth it. 
*/ schedule_timeout_uninterruptible(1); } } } void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); static int qdisc_change_tx_queue_len(struct net_device *dev, struct netdev_queue *dev_queue) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); const struct Qdisc_ops *ops = qdisc->ops; if (ops->change_tx_queue_len) return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); return 0; } void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) { #ifdef CONFIG_NET_SCHED struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int i; for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); /* Only update the default qdiscs we created, * qdiscs with handles are always hashed. */ if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_del(qdisc); } for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_add(qdisc, false); } #endif } EXPORT_SYMBOL(mq_change_real_num_tx); int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; unsigned int i; int ret = 0; if (up) dev_deactivate(dev); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); /* TODO: revert changes on a partial failure */ if (ret) break; } if (up) dev_activate(dev); return ret; } static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) { struct Qdisc *qdisc = _qdisc; rcu_assign_pointer(dev_queue->qdisc, qdisc); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } void dev_init_scheduler(struct net_device *dev) { rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_put(rtnl_dereference(dev->qdisc)); rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } /** * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division * @rate: Rate to compute reciprocal division values of * @mult: Multiplier for reciprocal division * @shift: Shift for reciprocal division * * The multiplier and shift for reciprocal division by rate are stored * in mult and shift. * * The deal here is to replace a divide by a reciprocal one * in fast path (a reciprocal divide is a multiply and a shift) * * Normal formula would be : * time_in_ns = (NSEC_PER_SEC * len) / rate_bps * * We compute mult/shift to use instead : * time_in_ns = (len * mult) >> shift; * * We try to get the highest possible mult value for accuracy, * but have to make sure no overflows will ever happen. * * reciprocal_value() is not used here it doesn't handle 64-bit values. 
*/ static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) { u64 factor = NSEC_PER_SEC; *mult = 1; *shift = 0; if (rate <= 0) return; for (;;) { *mult = div64_u64(factor, rate); if (*mult & (1U << 31) || factor & (1ULL << 63)) break; factor <<= 1; (*shift)++; } } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) { r->rate_pkts_ps = pktrate64; psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ppscfg_precompute); void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { /* Protected with chain0->filter_chain_lock. * Can't access chain directly because tp_head can be NULL. */ struct mini_Qdisc *miniq_old = rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); } else { miniq = miniq_old != &miniqp->miniq1 ? &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq * we are about to modify. So ensure that at least one RCU * grace period has elapsed since the miniq was made * inactive. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) cond_synchronize_rcu(miniq->rcu_state); else if (!poll_state_synchronize_rcu(miniq->rcu_state)) synchronize_rcu_expedited(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); } if (miniq_old) /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block) { miniqp->miniq1.block = block; miniqp->miniq2.block = block; } EXPORT_SYMBOL(mini_qdisc_pair_block_init); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init); |
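The mult/shift pair computed by psched_ratecfg_precompute__() is easy to sanity-check outside the kernel. The sketch below re-derives it in user space (a plain 64-bit division stands in for div64_u64()); the 125,000,000 bytes/s rate (1 Gbit/s) and the 1500-byte frame are arbitrary demo inputs, and for them (len * mult) >> shift reproduces the exact 12,000 ns that the full NSEC_PER_SEC * len / rate division gives.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* User-space re-derivation of the mult/shift search in
 * psched_ratecfg_precompute__(); a plain division replaces div64_u64(). */
static void precompute(uint64_t rate, uint32_t *mult, uint8_t *shift)
{
	uint64_t factor = NSEC_PER_SEC;

	*mult = 1;
	*shift = 0;
	if (!rate)
		return;
	for (;;) {
		*mult = (uint32_t)(factor / rate);
		if ((*mult & (1U << 31)) || (factor & (1ULL << 63)))
			break;
		factor <<= 1;
		(*shift)++;
	}
}

int main(void)
{
	uint64_t rate_bps = 125000000;	/* 1 Gbit/s in bytes per second */
	uint64_t len = 1500;		/* one full-size Ethernet frame */
	uint32_t mult;
	uint8_t shift;

	precompute(rate_bps, &mult, &shift);

	uint64_t exact = NSEC_PER_SEC * len / rate_bps;
	uint64_t approx = (len * (uint64_t)mult) >> shift;

	printf("mult=%u shift=%u exact=%" PRIu64 "ns approx=%" PRIu64 "ns\n",
	       mult, (unsigned int)shift, exact, approx);
	return 0;
}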
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2020 Google LLC. */ #include <linux/filter.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/binfmts.h> #include <linux/lsm_hooks.h> #include <linux/bpf_lsm.h> #include <linux/kallsyms.h> #include <net/bpf_sk_storage.h> #include <linux/bpf_local_storage.h> #include <linux/btf_ids.h> #include <linux/ima.h> #include <linux/bpf-cgroup.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. */ #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ { \ return DEFAULT; \ } #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) BTF_SET_START(bpf_lsm_hooks) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) BTF_SET_START(bpf_lsm_disabled_hooks) BTF_ID(func, bpf_lsm_vm_enough_memory) BTF_ID(func, bpf_lsm_inode_need_killpriv) BTF_ID(func, bpf_lsm_inode_getsecurity) BTF_ID(func, bpf_lsm_inode_listsecurity) BTF_ID(func, bpf_lsm_inode_copy_up_xattr) BTF_ID(func, bpf_lsm_getselfattr) BTF_ID(func, bpf_lsm_getprocattr) BTF_ID(func, bpf_lsm_setprocattr) #ifdef CONFIG_KEYS BTF_ID(func, bpf_lsm_key_getsecurity) #endif #ifdef CONFIG_AUDIT BTF_ID(func, bpf_lsm_audit_rule_match) #endif BTF_ID(func, bpf_lsm_ismaclabel) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_SET_END(bpf_lsm_disabled_hooks) /* List of LSM hooks that should operate on 'current' cgroup regardless * of function signature.
*/ BTF_SET_START(bpf_lsm_current_hooks) /* operate on freshly allocated sk without any cgroup association */ #ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) #endif BTF_SET_END(bpf_lsm_current_hooks) /* List of LSM hooks that trigger while the socket is properly locked. */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) #ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) #endif BTF_SET_END(bpf_lsm_locked_sockopt_hooks) /* List of LSM hooks that trigger while the socket is _not_ locked, * but it's ok to call bpf_{g,s}etsockopt because the socket is still * in the early init phase. */ BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) #ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) #endif BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) #ifdef CONFIG_CGROUP_BPF void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { const struct btf_param *args __maybe_unused; if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || btf_id_set_contains(&bpf_lsm_current_hooks, prog->aux->attach_btf_id)) { *bpf_func = __cgroup_bpf_run_lsm_current; return; } #ifdef CONFIG_NET args = btf_params(prog->aux->attach_func_proto); if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) *bpf_func = __cgroup_bpf_run_lsm_socket; else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) *bpf_func = __cgroup_bpf_run_lsm_sock; else #endif *bpf_func = __cgroup_bpf_run_lsm_current; } #endif int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { u32 btf_id = prog->aux->attach_btf_id; const char *func_name = prog->aux->attach_func_name; if (!prog->gpl_compatible) { bpf_log(vlog, "LSM programs must have a GPL compatible license\n"); return -EINVAL; } if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) { bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n", btf_id, func_name); return -EINVAL; } if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", btf_id, func_name); return -EINVAL; } return 0; } /* Mask for all the currently supported BPRM option flags */ #define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags) { if (flags & ~BPF_F_BRPM_OPTS_MASK) return -EINVAL; bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC); return 0; } BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm) static const struct bpf_func_proto bpf_bprm_opts_set_proto = { .func = bpf_bprm_opts_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0], .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size) { return ima_inode_hash(inode, dst, size); } static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog) { return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .func = bpf_ima_inode_hash, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .allowed = bpf_ima_inode_hash_allowed, }; BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void 
*, dst, u32, size) { return ima_file_hash(file, dst, size); } BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) static const struct bpf_func_proto bpf_ima_file_hash_proto = { .func = bpf_ima_file_hash, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .allowed = bpf_ima_inode_hash_allowed, }; BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto = { .func = bpf_get_attach_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; if (prog->expected_attach_type == BPF_LSM_CGROUP) { func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; } switch (func_id) { case BPF_FUNC_inode_storage_get: return &bpf_inode_storage_get_proto; case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; #ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #endif /* CONFIG_NET */ case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: return &bpf_ima_inode_hash_proto; case BPF_FUNC_ima_file_hash: return &bpf_ima_file_hash_proto; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; #ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) return NULL; if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, prog->aux->attach_btf_id)) return &bpf_sk_setsockopt_proto; if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, prog->aux->attach_btf_id)) return &bpf_unlocked_sk_setsockopt_proto; return NULL; case BPF_FUNC_getsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) return NULL; if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, prog->aux->attach_btf_id)) return &bpf_sk_getsockopt_proto; if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, prog->aux->attach_btf_id)) return &bpf_unlocked_sk_getsockopt_proto; return NULL; #endif default: return tracing_prog_func_proto(func_id, prog); } } /* The set of hooks which are called without pagefaults disabled and are allowed * to "sleep" and thus can be used for sleepable BPF programs. 
*/ BTF_SET_START(sleepable_lsm_hooks) BTF_ID(func, bpf_lsm_bpf) BTF_ID(func, bpf_lsm_bpf_map) BTF_ID(func, bpf_lsm_bpf_map_create) BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog) BTF_ID(func, bpf_lsm_bpf_prog_load) BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_bpf_token_create) BTF_ID(func, bpf_lsm_bpf_token_free) BTF_ID(func, bpf_lsm_bpf_token_cmd) BTF_ID(func, bpf_lsm_bpf_token_capable) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, bpf_lsm_bprm_committing_creds) BTF_ID(func, bpf_lsm_bprm_creds_for_exec) BTF_ID(func, bpf_lsm_bprm_creds_from_file) BTF_ID(func, bpf_lsm_capget) BTF_ID(func, bpf_lsm_capset) BTF_ID(func, bpf_lsm_cred_prepare) BTF_ID(func, bpf_lsm_file_ioctl) BTF_ID(func, bpf_lsm_file_lock) BTF_ID(func, bpf_lsm_file_open) BTF_ID(func, bpf_lsm_file_post_open) BTF_ID(func, bpf_lsm_file_receive) BTF_ID(func, bpf_lsm_inode_create) BTF_ID(func, bpf_lsm_inode_free_security) BTF_ID(func, bpf_lsm_inode_getattr) BTF_ID(func, bpf_lsm_inode_getxattr) BTF_ID(func, bpf_lsm_inode_mknod) BTF_ID(func, bpf_lsm_inode_need_killpriv) BTF_ID(func, bpf_lsm_inode_post_setxattr) BTF_ID(func, bpf_lsm_inode_post_removexattr) BTF_ID(func, bpf_lsm_inode_readlink) BTF_ID(func, bpf_lsm_inode_removexattr) BTF_ID(func, bpf_lsm_inode_rename) BTF_ID(func, bpf_lsm_inode_rmdir) BTF_ID(func, bpf_lsm_inode_setattr) BTF_ID(func, bpf_lsm_inode_setxattr) BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) #ifdef CONFIG_SECURITY_PATH BTF_ID(func, bpf_lsm_path_unlink) BTF_ID(func, bpf_lsm_path_mkdir) BTF_ID(func, bpf_lsm_path_rmdir) BTF_ID(func, bpf_lsm_path_truncate) BTF_ID(func, bpf_lsm_path_symlink) BTF_ID(func, bpf_lsm_path_link) BTF_ID(func, bpf_lsm_path_rename) BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) BTF_ID(func, bpf_lsm_release_secctx) BTF_ID(func, bpf_lsm_sb_alloc_security) BTF_ID(func, bpf_lsm_sb_eat_lsm_opts) BTF_ID(func, bpf_lsm_sb_kern_mount) BTF_ID(func, bpf_lsm_sb_mount) BTF_ID(func, bpf_lsm_sb_remount) BTF_ID(func, bpf_lsm_sb_set_mnt_opts) BTF_ID(func, bpf_lsm_sb_show_options) BTF_ID(func, bpf_lsm_sb_statfs) BTF_ID(func, bpf_lsm_sb_umount) BTF_ID(func, bpf_lsm_settime) #ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_inet_conn_established) BTF_ID(func, bpf_lsm_socket_accept) BTF_ID(func, bpf_lsm_socket_bind) BTF_ID(func, bpf_lsm_socket_connect) BTF_ID(func, bpf_lsm_socket_create) BTF_ID(func, bpf_lsm_socket_getpeername) BTF_ID(func, bpf_lsm_socket_getpeersec_dgram) BTF_ID(func, bpf_lsm_socket_getsockname) BTF_ID(func, bpf_lsm_socket_getsockopt) BTF_ID(func, bpf_lsm_socket_listen) BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_recvmsg) BTF_ID(func, bpf_lsm_socket_sendmsg) BTF_ID(func, bpf_lsm_socket_shutdown) BTF_ID(func, bpf_lsm_socket_socketpair) #endif /* CONFIG_SECURITY_NETWORK */ BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, 
bpf_lsm_file_free_security) #ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) #endif /* CONFIG_SECURITY_NETWORK */ BTF_ID(func, bpf_lsm_task_free) BTF_SET_END(untrusted_lsm_hooks) bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); } bool bpf_lsm_is_trusted(const struct bpf_prog *prog) { return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id); } const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; /* hooks return 0 or 1 */ BTF_SET_START(bool_lsm_hooks) #ifdef CONFIG_SECURITY_NETWORK_XFRM BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match) #endif #ifdef CONFIG_AUDIT BTF_ID(func, bpf_lsm_audit_rule_known) #endif BTF_ID(func, bpf_lsm_inode_xattr_skipcap) BTF_SET_END(bool_lsm_hooks) int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *retval_range) { /* no return value range for void hooks */ if (!prog->aux->attach_func_proto->type) return -EINVAL; if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) { retval_range->minval = 0; retval_range->maxval = 1; } else { /* All other available LSM hooks, except task_prctl, return 0 * on success and negative error code on failure. * To keep things simple, we only allow bpf progs to return 0 * or negative errno for task_prctl too. */ retval_range->minval = -MAX_ERRNO; retval_range->maxval = 0; } return 0; } |
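The verifier-side checks above spell out what a loadable program must satisfy: a GPL-compatible license, an attach point outside bpf_lsm_disabled_hooks, and a return value of 0 or a negative errno for non-bool hooks. The following is a minimal, illustrative libbpf/CO-RE program against the file_mprotect hook; it assumes a BTF-generated vmlinux.h, and the program name and toy policy are examples only:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative BPF LSM program (not part of this file). It attaches to the
 * nop bpf_lsm_file_mprotect() stub generated by the LSM_HOOK expansion
 * above, carries the GPL license required by bpf_lsm_verify_prog(), and
 * returns 0 or a negative errno as enforced by bpf_lsm_get_retval_range().
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#ifndef EPERM
#define EPERM 1	/* from <errno.h>; vmlinux.h does not provide the macro */
#endif

char LICENSE[] SEC("license") = "GPL";

SEC("lsm/file_mprotect")
int BPF_PROG(deny_heap_mprotect, struct vm_area_struct *vma,
	     unsigned long reqprot, unsigned long prot, int ret)
{
	/* @ret carries the verdict of a previously attached program. */
	if (ret)
		return ret;

	/* Toy policy: refuse protection changes on the heap mapping. */
	if (vma->vm_start >= vma->vm_mm->start_brk &&
	    vma->vm_end <= vma->vm_mm->brk)
		return -EPERM;

	return 0;
}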
1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> #include "filetable.h" #include "io_uring.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" #include "register.h" struct io_rsrc_update { struct file *file; u64 arg; u32 nr_args; u32 offset; }; static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) #define IO_CACHED_BVECS_SEGS 32 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; if (!nr_pages) return 0; /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; cur_pages = atomic_long_read(&user->locked_vm); do { new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); return 0; } void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, unsigned long nr_pages) { if (user) __io_unaccount_mem(user, nr_pages); if (mm_account) atomic64_sub(nr_pages, &mm_account->pinned_vm); } int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, unsigned long nr_pages) { int ret; if (user) { ret = __io_account_mem(user, nr_pages); if (ret) return ret; } if (mm_account) atomic64_add(nr_pages, &mm_account->pinned_vm); return 0; } int io_validate_user_buf_range(u64 uaddr, u64 ulen) { unsigned long tmp, base = (unsigned long)uaddr; unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); /* arbitrary limit, but we need something */ if (ulen > SZ_1G || !ulen) return -EFAULT; if (check_add_overflow(base, acct_len, &tmp)) return -EOVERFLOW; return 0; } static int io_buffer_validate(struct iovec *iov) { /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ if (!iov->iov_base) return iov->iov_len ? 
-EFAULT : 0; return io_validate_user_buf_range((unsigned long)iov->iov_base, iov->iov_len); } static void io_release_ubuf(void *priv) { struct io_mapped_ubuf *imu = priv; unsigned int i; for (i = 0; i < imu->nr_bvecs; i++) { struct folio *folio = page_folio(imu->bvec[i].bv_page); unpin_user_folio(folio, 1); } } static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, int nr_bvecs) { if (nr_bvecs <= IO_CACHED_BVECS_SEGS) return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), GFP_KERNEL); } static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) io_cache_free(&ctx->imu_cache, imu); else kvfree(imu); } static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { if (unlikely(refcount_read(&imu->refs) > 1)) { if (!refcount_dec_and_test(&imu->refs)) return; } if (imu->acct_pages) io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); imu->release(imu->priv); io_free_imu(ctx, imu); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { struct io_rsrc_node *node; node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); if (node) { node->type = type; node->refs = 1; node->tag = 0; node->file_ptr = 0; } return node; } bool io_rsrc_cache_init(struct io_ring_ctx *ctx) { const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, IO_CACHED_BVECS_SEGS); const int node_size = sizeof(struct io_rsrc_node); bool ret; ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, node_size, 0); ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, imu_cache_size, 0); return ret; } void io_rsrc_cache_free(struct io_ring_ctx *ctx) { io_alloc_cache_free(&ctx->node_cache, kfree); io_alloc_cache_free(&ctx->imu_cache, kfree); } static void io_clear_table_tags(struct io_rsrc_data *data) { int i; for (i = 0; i < data->nr; i++) { struct io_rsrc_node *node = data->nodes[i]; if (node) node->tag = 0; } } __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { if (!data->nr) return; while (data->nr--) { if (data->nodes[data->nr]) io_put_rsrc_node(ctx, data->nodes[data->nr]); } kvfree(data->nodes); data->nodes = NULL; data->nr = 0; } __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) { data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (data->nodes) { data->nr = nr; return 0; } return -ENOMEM; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); int fd, i, err = 0; unsigned int done; if (!ctx->file_table.data.nr) return -ENXIO; if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { u64 tag = 0; if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { err = -EINVAL; break; } if (fd == IORING_REGISTER_FILES_SKIP) continue; i = up->offset + done; if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); if (fd != -1) { struct file *file = fget(fd); struct io_rsrc_node *node; if (!file) { err = -EBADF; break; } /* * Don't allow io_uring instances to be registered. 
*/ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); break; } ctx->file_table.data.nodes[i] = node; if (tag) node->tag = tag; io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } } return done ? done : err; } static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec fast_iov, *iov; struct page *last_hpage = NULL; struct iovec __user *uvec; u64 user_data = up->data; __u32 done; int i, err; if (!ctx->buf_table.nr) return -ENXIO; if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { struct io_rsrc_node *node; u64 tag = 0; uvec = u64_to_user_ptr(user_data); iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } err = io_buffer_validate(iov); if (err) break; node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) { err = PTR_ERR(node); break; } if (tag) { if (!node) { err = -EINVAL; break; } node->tag = tag; } i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; if (ctx->compat) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); } return done ? done : err; } static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args) { __u32 tmp; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; switch (type) { case IORING_RSRC_FILE: return __io_sqe_files_update(ctx, up, nr_args); case IORING_RSRC_BUFFER: return __io_sqe_buffers_update(ctx, up, nr_args); } return -EINVAL; } int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update2 up; if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; if (size != sizeof(up)) return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; /* keep it extendible */ if (size != sizeof(rr)) return -EINVAL; memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; if (!rr.nr || rr.resv2) return -EINVAL; if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } return -EINVAL; } int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe 
*sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; up->offset = READ_ONCE(sqe->off); up->nr_args = READ_ONCE(sqe->len); if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; if (!req->ctx->file_table.data.nr) return -ENXIO; for (done = 0; done < up->nr_args; done++) { if (get_user(fd, &fds[done])) { ret = -EFAULT; break; } file = fget(fd); if (!file) { ret = -EBADF; break; } ret = io_fixed_fd_install(req, issue_flags, file, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; if (put_user(ret, &fds[done])) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } } if (done) return done; return ret; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); struct io_ring_ctx *ctx = req->ctx; struct io_uring_rsrc_update2 up2; int ret; up2.offset = up->offset; up2.data = up->arg; up2.nr = 0; up2.tags = 0; up2.resv = 0; up2.resv2 = 0; if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (node->tag) io_post_aux_cqe(ctx, node->tag, 0, 0); switch (node->type) { case IORING_RSRC_FILE: fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); break; } io_cache_free(&ctx->node_cache, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { if (!ctx->file_table.data.nr) return -ENXIO; io_free_file_tables(ctx, &ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); return 0; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; struct file *file; int fd, ret; unsigned i; if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; for (i = 0; i < nr_args; i++) { struct io_rsrc_node *node; u64 tag = 0; ret = -EFAULT; if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) goto fail; if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; if (tag) goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) goto fail; /* * Don't allow io_uring instances to be registered. 
*/ if (io_is_uring_fops(file)) { fput(file); goto fail; } ret = -ENOMEM; node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { fput(file); goto fail; } if (tag) node->tag = tag; ctx->file_table.data.nodes[i] = node; io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: io_clear_table_tags(&ctx->file_table.data); io_sqe_files_unregister(ctx); return ret; } int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { if (!ctx->buf_table.nr) return -ENXIO; io_rsrc_data_free(ctx, &ctx->buf_table); return 0; } /* * Not super efficient, but this is just a registration time. And we do cache * the last compound head, so generally we'll only do a full search if we don't * match that one. * * We check if the given compound head page has already been accounted, to * avoid double accounting it. This allows us to account the full size of the * page, not just the constituent pages of a huge page. */ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct page *hpage) { int i, j; /* check current page array */ for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) continue; if (compound_head(pages[i]) == hpage) return true; } /* check previously registered pages */ for (i = 0; i < ctx->buf_table.nr; i++) { struct io_rsrc_node *node = ctx->buf_table.nodes[i]; struct io_mapped_ubuf *imu; if (!node) continue; imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; if (compound_head(imu->bvec[j].bv_page) == hpage) return true; } } return false; } static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct io_mapped_ubuf *imu, struct page **last_hpage) { int i, ret; imu->acct_pages = 0; for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) { imu->acct_pages++; } else { struct page *hpage; hpage = compound_head(pages[i]); if (hpage == *last_hpage) continue; *last_hpage = hpage; if (headpage_already_acct(ctx, pages, i, hpage)) continue; imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; } } if (!imu->acct_pages) return 0; ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; } static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; unsigned nr_pages_left = *nr_pages; unsigned nr_folios = data->nr_folios; unsigned i, j; /* Store head pages only*/ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; for (i = 0, j = 0; i < nr_folios; i++) { struct page *p = compound_head(page_array[j]); struct folio *folio = page_folio(p); unsigned int nr; WARN_ON_ONCE(i > 0 && p != page_array[j]); nr = i ? data->nr_pages_mid : data->nr_pages_head; nr = min(nr, nr_pages_left); /* Drop all but one ref, the entire folio will remain pinned. 
*/ if (nr > 1) unpin_user_folio(folio, nr - 1); j += nr; nr_pages_left -= nr; new_array[i] = p; } WARN_ON_ONCE(j != *nr_pages); kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; return true; } bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data) { struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. */ for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; continue; } if (nr_folios == 1) { if (folio_page_idx(folio, page_array[i-1]) != data->nr_pages_mid - 1) return false; data->nr_pages_head = count; } else if (count != data->nr_pages_mid) { return false; } folio = page_folio(page_array[i]); if (folio_size(folio) != (1UL << data->folio_shift) || folio_page_idx(folio, page_array[i]) != 0) return false; count = 1; nr_folios++; } if (nr_folios == 1) data->nr_pages_head = count; data->nr_folios = nr_folios; return true; } static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; struct io_rsrc_node *node; unsigned long off; size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; bool coalesced = false; if (!iov->iov_base) return NULL; node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); pages = NULL; goto done; } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { if (data.nr_pages_mid != 1) coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) goto done; size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; imu->folio_shift = PAGE_SHIFT; imu->release = io_release_ubuf; imu->priv = imu; imu->is_kbuf = false; imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); off = (unsigned long)iov->iov_base & ~PAGE_MASK; if (coalesced) off += data.first_folio_page_idx << PAGE_SHIFT; node->buf = imu; ret = 0; for (i = 0; i < nr_pages; i++) { size_t vec_len; vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } done: if (ret) { if (imu) io_free_imu(ctx, imu); if (pages) { for (i = 0; i < nr_pages; i++) unpin_user_folio(page_folio(pages[i]), 1); } io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } kvfree(pages); return node; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; struct io_rsrc_data data; struct iovec fast_iov, *iov = &fast_iov; const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); if (ctx->buf_table.nr) return 
-EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret; if (!arg) memset(iov, 0, sizeof(*iov)); for (i = 0; i < nr_args; i++) { struct io_rsrc_node *node; u64 tag = 0; if (arg) { uvec = (struct iovec __user *) arg; iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } ret = io_buffer_validate(iov); if (ret) break; if (ctx->compat) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); } if (tags) { if (copy_from_user(&tag, &tags[i], sizeof(tag))) { ret = -EFAULT; break; } } node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) { ret = PTR_ERR(node); break; } if (tag) { if (!node) { ret = -EINVAL; break; } node->tag = tag; } data.nodes[i] = node; } ctx->buf_table = data; if (ret) { io_clear_table_tags(&ctx->buf_table); io_sqe_buffers_unregister(ctx); } return ret; } int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags) { struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; struct req_iterator rq_iter; struct io_mapped_ubuf *imu; struct io_rsrc_node *node; struct bio_vec bv; unsigned int nr_bvecs = 0; int ret = 0; io_ring_submit_lock(ctx, issue_flags); if (index >= data->nr) { ret = -EINVAL; goto unlock; } index = array_index_nospec(index, data->nr); if (data->nodes[index]) { ret = -EBUSY; goto unlock; } node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) { ret = -ENOMEM; goto unlock; } /* * blk_rq_nr_phys_segments() may overestimate the number of bvecs * but avoids needing to iterate over the bvecs */ imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); if (!imu) { kfree(node); ret = -ENOMEM; goto unlock; } imu->ubuf = 0; imu->len = blk_rq_bytes(rq); imu->acct_pages = 0; imu->folio_shift = PAGE_SHIFT; refcount_set(&imu->refs, 1); imu->release = release; imu->priv = rq; imu->is_kbuf = true; imu->dir = 1 << rq_data_dir(rq); rq_for_each_bvec(bv, rq, rq_iter) imu->bvec[nr_bvecs++] = bv; imu->nr_bvecs = nr_bvecs; node->buf = imu; data->nodes[index] = node; unlock: io_ring_submit_unlock(ctx, issue_flags); return ret; } EXPORT_SYMBOL_GPL(io_buffer_register_bvec); int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, unsigned int issue_flags) { struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; struct io_rsrc_node *node; int ret = 0; io_ring_submit_lock(ctx, issue_flags); if (index >= data->nr) { ret = -EINVAL; goto unlock; } index = array_index_nospec(index, data->nr); node = data->nodes[index]; if (!node) { ret = -EINVAL; goto unlock; } if (!node->buf->is_kbuf) { ret = -EBUSY; goto unlock; } io_put_rsrc_node(ctx, node); data->nodes[index] = NULL; unlock: io_ring_submit_unlock(ctx, issue_flags); return ret; } EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); static int validate_fixed_range(u64 buf_addr, size_t len, const struct io_mapped_ubuf *imu) { u64 buf_end; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; if (unlikely(len > MAX_RW_COUNT)) return -EFAULT; return 0; } static int io_import_kbuf(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, size_t len, size_t offset) { size_t count = len + offset; iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); 
iov_iter_advance(iter, offset); if (count < imu->len) { const struct bio_vec *bvec = iter->bvec; while (len > bvec->bv_len) { len -= bvec->bv_len; bvec++; } iter->nr_segs = 1 + bvec - iter->bvec; } return 0; } static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { const struct bio_vec *bvec; size_t folio_mask; unsigned nr_segs; size_t offset; int ret; ret = validate_fixed_range(buf_addr, len, imu); if (unlikely(ret)) return ret; if (!(imu->dir & (1 << ddir))) return -EFAULT; offset = buf_addr - imu->ubuf; if (imu->is_kbuf) return io_import_kbuf(ddir, iter, imu, len, offset); /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates * over each segment manually. We can cheat a bit here for user * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the * first and last bvec */ folio_mask = (1UL << imu->folio_shift) - 1; bvec = imu->bvec; if (offset >= bvec->bv_len) { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; seg_skip = 1 + (offset >> imu->folio_shift); bvec += seg_skip; offset &= folio_mask; } nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; iov_iter_bvec(iter, ddir, bvec, nr_segs, len); iter->iov_offset = offset; return 0; } inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *node; if (req->flags & REQ_F_BUF_NODE) return req->buf_node; req->flags |= REQ_F_BUF_NODE; io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); if (node) { node->refs++; req->buf_node = node; io_ring_submit_unlock(ctx, issue_flags); return node; } req->flags &= ~REQ_F_BUF_NODE; io_ring_submit_unlock(ctx, issue_flags); return NULL; } int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags) { struct io_rsrc_node *node; node = io_find_buf_node(req, issue_flags); if (!node) return -EFAULT; return io_import_fixed(ddir, iter, node->buf, buf_addr, len); } /* Lock two rings at once. The rings must be different! */ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) { if (ctx1 > ctx2) swap(ctx1, ctx2); mutex_lock(&ctx1->uring_lock); mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); } /* Both rings are locked by the caller. */ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, struct io_uring_clone_buffers *arg) { struct io_rsrc_data data; int i, ret, off, nr; unsigned int nbufs; lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&src_ctx->uring_lock); /* * Accounting state is shared between the two rings; that only works if * both rings are accounted towards the same counters. 
*/ if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) return -EINVAL; /* if offsets are given, must have nr specified too */ if (!arg->nr && (arg->dst_off || arg->src_off)) return -EINVAL; /* not allowed unless REPLACE is set */ if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) return -EBUSY; nbufs = src_ctx->buf_table.nr; if (!nbufs) return -ENXIO; if (!arg->nr) arg->nr = nbufs; else if (arg->nr > nbufs) return -EINVAL; else if (arg->nr > IORING_MAX_REG_BUFFERS) return -EINVAL; if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs) return -EOVERFLOW; if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) return -EOVERFLOW; if (nbufs > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); if (ret) return ret; /* Copy original dst nodes from before the cloned range */ for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { struct io_rsrc_node *node = ctx->buf_table.nodes[i]; if (node) { data.nodes[i] = node; node->refs++; } } off = arg->dst_off; i = arg->src_off; nr = arg->nr; while (nr--) { struct io_rsrc_node *dst_node, *src_node; src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); if (!src_node) { dst_node = NULL; } else { dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) { io_rsrc_data_free(ctx, &data); return -ENOMEM; } refcount_inc(&src_node->buf->refs); dst_node->buf = src_node->buf; } data.nodes[off++] = dst_node; i++; } /* Copy original dst nodes from after the cloned range */ for (i = nbufs; i < ctx->buf_table.nr; i++) { struct io_rsrc_node *node = ctx->buf_table.nodes[i]; if (node) { data.nodes[i] = node; node->refs++; } } /* * If asked for replace, put the old table. data->nodes[] holds both * old and new nodes at this point. */ if (arg->flags & IORING_REGISTER_DST_REPLACE) io_rsrc_data_free(ctx, &ctx->buf_table); /* * ctx->buf_table must be empty now - either the contents are being * replaced and we just freed the table, or the contents are being * copied to a ring that does not have buffers yet (checked at function * entry). */ WARN_ON_ONCE(ctx->buf_table.nr); ctx->buf_table = data; return 0; } /* * Copy the registered buffers from the source ring whose file descriptor * is given in the src_fd to the current ring. This is identical to registering * the buffers with ctx, except faster as mappings already exist. * * Since the memory is already accounted once, don't account it again. 
*/ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_clone_buffers buf; struct io_ring_ctx *src_ctx; bool registered_src; struct file *file; int ret; if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) return -EINVAL; if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) return -EBUSY; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL; registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); src_ctx = file->private_data; if (src_ctx != ctx) { mutex_unlock(&ctx->uring_lock); lock_two_rings(ctx, src_ctx); if (src_ctx->submitter_task && src_ctx->submitter_task != current) { ret = -EEXIST; goto out; } } ret = io_clone_buffers(ctx, src_ctx, &buf); out: if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); fput(file); return ret; } void io_vec_free(struct iou_vec *iv) { if (!iv->iovec) return; kfree(iv->iovec); iv->iovec = NULL; iv->nr = 0; } int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct iovec *iov; iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); if (!iov) return -ENOMEM; io_vec_free(iv); iv->iovec = iov; iv->nr = nr_entries; return 0; } static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, struct iovec *iovec, unsigned nr_iovs, struct iou_vec *vec) { unsigned long folio_size = 1 << imu->folio_shift; unsigned long folio_mask = folio_size - 1; struct bio_vec *res_bvec = vec->bvec; size_t total_len = 0; unsigned bvec_idx = 0; unsigned iov_idx; for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { size_t iov_len = iovec[iov_idx].iov_len; u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; struct bio_vec *src_bvec; size_t offset; int ret; ret = validate_fixed_range(buf_addr, iov_len, imu); if (unlikely(ret)) return ret; if (unlikely(!iov_len)) return -EFAULT; if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) return -EOVERFLOW; offset = buf_addr - imu->ubuf; /* * Only the first bvec can have non zero bv_offset, account it * here and work with full folios below. 
*/ offset += imu->bvec[0].bv_offset; src_bvec = imu->bvec + (offset >> imu->folio_shift); offset &= folio_mask; for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { size_t seg_size = min_t(size_t, iov_len, folio_size - offset); bvec_set_page(&res_bvec[bvec_idx], src_bvec->bv_page, seg_size, offset); iov_len -= seg_size; } } if (total_len > MAX_RW_COUNT) return -EINVAL; iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); return 0; } static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, struct io_mapped_ubuf *imu) { unsigned shift = imu->folio_shift; size_t max_segs = 0; unsigned i; for (i = 0; i < nr_iovs; i++) { max_segs += (iov[i].iov_len >> shift) + 2; if (max_segs > INT_MAX) return -EOVERFLOW; } return max_segs; } static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, struct iovec *iovec, unsigned nr_iovs, struct iou_vec *vec) { const struct bio_vec *src_bvec = imu->bvec; struct bio_vec *res_bvec = vec->bvec; unsigned res_idx = 0; size_t total_len = 0; unsigned iov_idx; for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; size_t iov_len = iovec[iov_idx].iov_len; struct bvec_iter bi = { .bi_size = offset + iov_len, }; struct bio_vec bv; bvec_iter_advance(src_bvec, &bi, offset); for_each_mp_bvec(bv, src_bvec, bi, bi) res_bvec[res_idx++] = bv; total_len += iov_len; } iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); return 0; } static int iov_kern_bvec_size(const struct iovec *iov, const struct io_mapped_ubuf *imu, unsigned int *nr_seg) { size_t offset = (size_t)(uintptr_t)iov->iov_base; const struct bio_vec *bvec = imu->bvec; int start = 0, i = 0; size_t off = 0; int ret; ret = validate_fixed_range(offset, iov->iov_len, imu); if (unlikely(ret)) return ret; for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; off += bvec[i].bv_len, i++) { if (offset >= off && offset < off + bvec[i].bv_len) start = i; } *nr_seg = i - start; return 0; } static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, struct io_mapped_ubuf *imu, unsigned *nr_segs) { unsigned max_segs = 0; size_t total_len = 0; unsigned i; int ret; *nr_segs = 0; for (i = 0; i < nr_iovs; i++) { if (unlikely(!iov[i].iov_len)) return -EFAULT; if (unlikely(check_add_overflow(total_len, iov[i].iov_len, &total_len))) return -EOVERFLOW; ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); if (unlikely(ret)) return ret; *nr_segs += max_segs; } if (total_len > MAX_RW_COUNT) return -EINVAL; return 0; } int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, unsigned nr_iovs, unsigned issue_flags) { struct io_rsrc_node *node; struct io_mapped_ubuf *imu; unsigned iovec_off; struct iovec *iov; unsigned nr_segs; node = io_find_buf_node(req, issue_flags); if (!node) return -EFAULT; imu = node->buf; if (!(imu->dir & (1 << ddir))) return -EFAULT; iovec_off = vec->nr - nr_iovs; iov = vec->iovec + iovec_off; if (imu->is_kbuf) { int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); if (unlikely(ret)) return ret; } else { int ret = io_estimate_bvec_size(iov, nr_iovs, imu); if (ret < 0) return ret; nr_segs = ret; } if (sizeof(struct bio_vec) > sizeof(struct iovec)) { size_t bvec_bytes; bvec_bytes = nr_segs * sizeof(struct bio_vec); nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); nr_segs += nr_iovs; } if (nr_segs > vec->nr) { struct iou_vec tmp_vec = {}; int ret; ret = io_vec_realloc(&tmp_vec, nr_segs); if (ret) return ret; iovec_off = tmp_vec.nr - nr_iovs; 
memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); io_vec_free(vec); *vec = tmp_vec; iov = vec->iovec + iovec_off; req->flags |= REQ_F_NEED_CLEANUP; } if (imu->is_kbuf) return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); } int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, const struct iovec __user *uvec, size_t uvec_segs) { struct iovec *iov; int iovec_off, ret; void *res; if (uvec_segs > iv->nr) { ret = io_vec_realloc(iv, uvec_segs); if (ret) return ret; req->flags |= REQ_F_NEED_CLEANUP; } /* pad iovec to the right */ iovec_off = iv->nr - uvec_segs; iov = iv->iovec + iovec_off; res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, io_is_compat(req->ctx)); if (IS_ERR(res)) return PTR_ERR(res); req->flags |= REQ_F_IMPORT_BUFFER; return 0; } |
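On the userspace side, the registration and import paths above are driven through IORING_REGISTER_BUFFERS and the *_FIXED opcodes. Below is a minimal liburing-based sketch (error handling abbreviated; the file name and sizes are arbitrary) that registers one fixed buffer, which lands in io_sqe_buffers_register() on the kernel side, and then writes from it:

/*
 * Userspace sketch (requires liburing): register a single fixed buffer and
 * submit an IORING_OP_WRITE_FIXED request that reuses the pinned pages.
 */
#include <liburing.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	int fd;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* The buffer must stay valid while registered; the kernel pins it. */
	iov.iov_len = 4096;
	iov.iov_base = aligned_alloc(4096, iov.iov_len);
	memset(iov.iov_base, 'x', iov.iov_len);

	/* Serviced by io_sqe_buffers_register() via IORING_REGISTER_BUFFERS. */
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	fd = open("/tmp/fixed-buf-demo", O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (fd < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* buf_index 0 refers to the node created for the iovec above. */
	io_uring_prep_write_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_unregister_buffers(&ring);
	io_uring_queue_exit(&ring);
	return 0;
}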
885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 | /* * * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * Cgroup v2 * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ #include <linux/cgroup.h> #include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) /* Use t->m[0] to encode the offset */ #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static struct cftype *dfl_files; static struct cftype *legacy_files; static inline struct page_counter * __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, bool rsvd) { if (rsvd) return &h_cg->rsvd_hugepage[idx]; return &h_cg->hugepage[idx]; } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? 
container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) { return (h_cg == root_h_cgroup); } static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { struct hstate *h; for_each_hstate(h) { if (page_counter_read( hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; } static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, struct hugetlb_cgroup *parent_h_cgroup) { int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { struct page_counter *fault, *fault_parent = NULL; struct page_counter *rsvd, *rsvd_parent = NULL; unsigned long limit; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( parent_h_cgroup, idx); rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx); rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx); page_counter_init(fault, fault_parent, false); page_counter_init(rsvd, rsvd_parent, false); if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) { fault->track_failcnt = true; rsvd->track_failcnt = true; } limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); VM_BUG_ON(page_counter_set_max(fault, limit)); VM_BUG_ON(page_counter_set_max(rsvd, limit)); } } static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) { int node; for_each_node(node) kfree(h_cgroup->nodeinfo[node]); kfree(h_cgroup); } static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; int node; h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; /* * TODO: this routine can waste much memory for nodes which will * never be onlined. It's better to use memory hotplug callback * function. */ for_each_node(node) { /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); if (!h_cgroup->nodeinfo[node]) goto fail_alloc_nodeinfo; } hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; fail_alloc_nodeinfo: hugetlb_cgroup_free(h_cgroup); return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* * Should be called with hugetlb_lock held. * Since we are holding hugetlb_lock, pages cannot get moved from * active list or uncharged from the cgroup, So no need to get * page reference and test for page active here. This function * cannot fail. 
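 *
 * Concretely, the code below falls back to the root cgroup (which has no
 * limit) and charges its counter when no parent is found, cancels the pages
 * on this cgroup's own counter, and then points the folio at the parent.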
*/ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ if (!hcg || hcg != h_cg) goto out; nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(folio, parent); out: return; } /* * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); } static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, enum hugetlb_memory_event event) { atomic_long_inc(&hugetlb->events_local[idx][event]); cgroup_file_notify(&hugetlb->events_local_file[idx]); do { atomic_long_inc(&hugetlb->events[idx][event]); cgroup_file_notify(&hugetlb->events_file[idx]); } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && !hugetlb_cgroup_is_root(hugetlb)); } static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr, bool rsvd) { int ret = 0; struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; if (hugetlb_cgroup_disabled()) goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); if (!page_counter_try_charge( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); css_put(&h_cg->css); goto done; } /* Reservations take a reference to the css because they do not get * reparented. */ if (!rsvd) css_put(&h_cg->css); done: *ptr = h_cg; return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); } int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); } /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; lockdep_assert_held(&hugetlb_lock); __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. 
*/ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); else { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); } void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); } void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || !resv->css) return; page_counter_uncharge(resv->reservation_counter, (end - start) * resv->pages_per_hpage); css_put(resv->css); } void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); /* * Only do css_put(rg->css) when we delete the entire region * because one file_region must hold exactly one css reference. 
*/ if (region_del) css_put(rg->css); } } enum { RES_USAGE, RES_RSVD_USAGE, RES_LIMIT, RES_RSVD_LIMIT, RES_MAX_USAGE, RES_RSVD_MAX_USAGE, RES_FAILCNT, RES_RSVD_FAILCNT, }; static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) { int nid; struct cftype *cft = seq_cft(seq); int idx = MEMFILE_IDX(cft->private); bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); struct cgroup_subsys_state *css; unsigned long usage; if (legacy) { /* Add up usage across all nodes for the non-hierarchical total. */ usage = 0; for_each_node_state(nid, N_MEMORY) usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); seq_printf(seq, "total=%lu", usage * PAGE_SIZE); /* Simply print the per-node usage for the non-hierarchical total. */ for_each_node_state(nid, N_MEMORY) seq_printf(seq, " N%d=%lu", nid, READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * PAGE_SIZE); seq_putc(seq, '\n'); } /* * The hierarchical total is pretty much the value recorded by the * counter, so use that. */ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); /* * For each node, transverse the css tree to obtain the hierarchical * node usage. */ for_each_node_state(nid, N_MEMORY) { usage = 0; rcu_read_lock(); css_for_each_descendant_pre(css, &h_cg->css) { usage += READ_ONCE(hugetlb_cgroup_from_css(css) ->nodeinfo[nid] ->usage[idx]); } rcu_read_unlock(); seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); } seq_putc(seq, '\n'); return 0; } static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_RSVD_USAGE: return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_RSVD_LIMIT: return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_RSVD_MAX_USAGE: return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_RSVD_FAILCNT: return rsvd_counter->failcnt; default: BUG(); } } static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) { int idx; u64 val; struct cftype *cft = seq_cft(seq); unsigned long limit; struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) seq_puts(seq, "max\n"); else seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; default: BUG(); } return 0; } static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, const char *max) { int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = 
hugetlb_cgroup_from_css(of_css(of)); bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; buf = strstrip(buf); ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; idx = MEMFILE_IDX(of_cft(of)->private); nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); } static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); } static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; case RES_RSVD_MAX_USAGE: page_counter_reset_watermark(rsvd_counter); break; case RES_FAILCNT: counter->failcnt = 0; break; case RES_RSVD_FAILCNT: rsvd_counter->failcnt = 0; break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) { if (hsize >= SZ_1G) snprintf(buf, size, "%luGB", hsize / SZ_1G); else if (hsize >= SZ_1M) snprintf(buf, size, "%luMB", hsize / SZ_1M); else snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } static int __hugetlb_events_show(struct seq_file *seq, bool local) { int idx; long max; struct cftype *cft = seq_cft(seq); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); if (local) max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); else max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); seq_printf(seq, "max %lu\n", max); return 0; } static int hugetlb_events_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, false); } static int hugetlb_events_local_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, true); } static struct cftype hugetlb_dfl_tmpl[] = { { .name = "max", .private = RES_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.max", .private = RES_RSVD_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .private = RES_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.current", .private = RES_RSVD_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events", .seq_show = hugetlb_events_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events.local", .seq_show = hugetlb_events_local_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = 
"numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, .flags = CFTYPE_NOT_ON_ROOT, }, /* don't need terminator here */ }; static struct cftype hugetlb_legacy_tmpl[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "rsvd.limit_in_bytes", .private = RES_RSVD_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "usage_in_bytes", .private = RES_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.usage_in_bytes", .private = RES_RSVD_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = RES_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.max_usage_in_bytes", .private = RES_RSVD_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "failcnt", .private = RES_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.failcnt", .private = RES_RSVD_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, }, /* don't need terminator here */ }; static void __init hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, struct cftype *tmpl, int tmpl_size) { char buf[32]; int i, idx = hstate_index(h); /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { *cft = *tmpl; /* rebuild the name */ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); /* rebuild the private */ cft->private = MEMFILE_PRIVATE(idx, tmpl->private); /* rebuild the file_offset */ if (tmpl->file_offset) { unsigned int offset = tmpl->file_offset; cft->file_offset = MEMFILE_OFFSET0(offset) + MEMFILE_FIELD_SIZE(offset) * idx; } lockdep_register_key(&cft->lockdep_key); } } static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, hugetlb_dfl_tmpl, DFL_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_init(struct hstate *h) { __hugetlb_cgroup_file_dfl_init(h); __hugetlb_cgroup_file_legacy_init(h); } static void __init __hugetlb_cgroup_file_pre_init(void) { int cft_count; cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); BUG_ON(!dfl_files); cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); BUG_ON(!legacy_files); } static void __init __hugetlb_cgroup_file_post_init(void) { WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, dfl_files)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, legacy_files)); } void __init hugetlb_cgroup_file_init(void) { struct hstate *h; __hugetlb_cgroup_file_pre_init(); for_each_hstate(h) __hugetlb_cgroup_file_init(h); __hugetlb_cgroup_file_post_init(); } /* * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = folio_hstate(old_folio); 
if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(old_folio, NULL); set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } static struct cftype hugetlb_files[] = { {} /* terminate */ }; struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, .dfl_cftypes = hugetlb_files, .legacy_cftypes = hugetlb_files, }; |
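The cftype ->private values used throughout the controller above pack an hstate index into the high 16 bits and a RES_* attribute into the low 16 bits (the MEMFILE_PRIVATE/MEMFILE_IDX/MEMFILE_ATTR macros near the top of the file). A small standalone sketch of that packing, assuming the same bit layout and picking index 2 with attribute 3 (RES_RSVD_LIMIT in the enum above) purely for illustration:

#include <assert.h>
#include <stdio.h>

/* Mirrors the MEMFILE_* layout: index in bits 16 and up, attribute below. */
#define DEMO_PRIVATE(x, val)	(((x) << 16) | (val))
#define DEMO_IDX(val)		(((val) >> 16) & 0xffff)
#define DEMO_ATTR(val)		((val) & 0xffff)

int main(void)
{
	unsigned int packed = DEMO_PRIVATE(2, 3);

	assert(DEMO_IDX(packed) == 2);	/* hstate index recovered */
	assert(DEMO_ATTR(packed) == 3);	/* attribute recovered */
	printf("private=%#x idx=%u attr=%u\n",
	       packed, DEMO_IDX(packed), DEMO_ATTR(packed));
	return 0;
}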
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Hopefully this will be a rather complete VT102 implementation. * * Beeping thanks to John T Kohl. * * Virtual Consoles, Screen Blanking, Screen Dumping, Color, Graphics * Chars, and VT100 enhancements by Peter MacDonald. * * Copy and paste function by Andrew Haylett, * some enhancements by Alessandro Rubini. * * Code to check for different video-cards mostly by Galen Hunt, * <g-hunt@ee.utah.edu> * * Rudimentary ISO 10646/Unicode/UTF-8 character set support by * Markus Kuhn, <mskuhn@immd4.informatik.uni-erlangen.de>.
* * Dynamic allocation of consoles, aeb@cwi.nl, May 1994 * Resizing of consoles, aeb, 940926 * * Code for xterm like mouse click reporting by Peter Orbaek 20-Jul-94 * <poe@daimi.aau.dk> * * User-defined bell sound, new setterm control sequences and printk * redirection by Martin Mares <mj@k332.feld.cvut.cz> 19-Nov-95 * * APM screenblank bug fixed Takashi Manabe <manabe@roy.dsl.tutics.tut.jp> * * Merge with the abstract console driver by Geert Uytterhoeven * <geert@linux-m68k.org>, Jan 1997. * * Original m68k console driver modifications by * * - Arno Griffioen <arno@usn.nl> * - David Carter <carter@cs.bris.ac.uk> * * The abstract console driver provides a generic interface for a text * console. It supports VGA text mode, frame buffer based graphical consoles * and special graphics processors that are only accessible through some * registers (e.g. a TMS340x0 GSP). * * The interface to the hardware is specified using a special structure * (struct consw) which contains function pointers to console operations * (see <linux/console.h> for more information). * * Support for changeable cursor shape * by Pavel Machek <pavel@atrey.karlin.mff.cuni.cz>, August 1997 * * Ported to i386 and con_scrolldelta fixed * by Emmanuel Marty <core@ggi-project.org>, April 1998 * * Resurrected character buffers in videoram plus lots of other trickery * by Martin Mares <mj@atrey.karlin.mff.cuni.cz>, July 1998 * * Removed old-style timers, introduced console_timer, made timer * deletion SMP-safe. 17Jun00, Andrew Morton * * Removed console_lock, enabled interrupts across all console operations * 13 March 2001, Andrew Morton * * Fixed UTF-8 mode so alternate charset modes always work according * to control sequences interpreted in do_con_trol function * preserving backward VT100 semigraphics compatibility, * malformed UTF sequences represented as sequences of replacement glyphs, * original codes or '?' 
as a last resort if replacement glyph is undefined * by Adam Tla/lka <atlka@pg.gda.pl>, Aug 2006 */ #include <linux/module.h> #include <linux/types.h> #include <linux/sched/signal.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kd.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/console.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/tiocl.h> #include <linux/kbd_kern.h> #include <linux/consolemap.h> #include <linux/timer.h> #include <linux/interrupt.h> #include <linux/workqueue.h> #include <linux/pm.h> #include <linux/font.h> #include <linux/bitops.h> #include <linux/notifier.h> #include <linux/device.h> #include <linux/io.h> #include <linux/uaccess.h> #include <linux/kdb.h> #include <linux/ctype.h> #include <linux/gcd.h> #define MAX_NR_CON_DRIVER 16 #define CON_DRIVER_FLAG_MODULE 1 #define CON_DRIVER_FLAG_INIT 2 #define CON_DRIVER_FLAG_ATTR 4 #define CON_DRIVER_FLAG_ZOMBIE 8 struct con_driver { const struct consw *con; const char *desc; struct device *dev; int node; int first; int last; int flag; }; static struct con_driver registered_con_driver[MAX_NR_CON_DRIVER]; const struct consw *conswitchp; /* * Here is the default bell parameters: 750HZ, 1/8th of a second */ #define DEFAULT_BELL_PITCH 750 #define DEFAULT_BELL_DURATION (HZ/8) #define DEFAULT_CURSOR_BLINK_MS 200 struct vc vc_cons [MAX_NR_CONSOLES]; EXPORT_SYMBOL(vc_cons); static const struct consw *con_driver_map[MAX_NR_CONSOLES]; static int con_open(struct tty_struct *, struct file *); static void vc_init(struct vc_data *vc, int do_clear); static void gotoxy(struct vc_data *vc, int new_x, int new_y); static void restore_cur(struct vc_data *vc); static void save_cur(struct vc_data *vc); static void reset_terminal(struct vc_data *vc, int do_clear); static void con_flush_chars(struct tty_struct *tty); static int set_vesa_blanking(u8 __user *mode); static void set_cursor(struct vc_data *vc); static void hide_cursor(struct vc_data *vc); static void console_callback(struct work_struct *ignored); static void con_driver_unregister_callback(struct work_struct *ignored); static void blank_screen_t(struct timer_list *unused); static void set_palette(struct vc_data *vc); static void unblank_screen(void); #define vt_get_kmsg_redirect() vt_kmsg_redirect(-1) int default_utf8 = true; module_param(default_utf8, int, S_IRUGO | S_IWUSR); int global_cursor_default = -1; module_param(global_cursor_default, int, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(global_cursor_default); static int cur_default = CUR_UNDERLINE; module_param(cur_default, int, S_IRUGO | S_IWUSR); /* * ignore_poke: don't unblank the screen when things are typed. This is * mainly for the privacy of braille terminal users. 
*/ static int ignore_poke; int do_poke_blanked_console; int console_blanked; EXPORT_SYMBOL(console_blanked); static enum vesa_blank_mode vesa_blank_mode; static int vesa_off_interval; static int blankinterval; core_param(consoleblank, blankinterval, int, 0444); static DECLARE_WORK(console_work, console_callback); static DECLARE_WORK(con_driver_unregister_work, con_driver_unregister_callback); /* * fg_console is the current virtual console, * last_console is the last used one, * want_console is the console we want to switch to, * saved_* variants are for save/restore around kernel debugger enter/leave */ int fg_console; EXPORT_SYMBOL(fg_console); int last_console; int want_console = -1; static int saved_fg_console; static int saved_last_console; static int saved_want_console; static int saved_vc_mode; static int saved_console_blanked; /* * For each existing display, we have a pointer to console currently visible * on that display, allowing consoles other than fg_console to be refreshed * appropriately. Unless the low-level driver supplies its own display_fg * variable, we use this one for the "master display". */ static struct vc_data *master_display_fg; /* * Unfortunately, we need to delay tty echo when we're currently writing to the * console since the code is (and always was) not re-entrant, so we schedule * all flip requests to process context with schedule-task() and run it from * console_callback(). */ /* * For the same reason, we defer scrollback to the console callback. */ static int scrollback_delta; /* * Hook so that the power management routines can (un)blank * the console on our behalf. */ int (*console_blank_hook)(int); EXPORT_SYMBOL(console_blank_hook); static DEFINE_TIMER(console_timer, blank_screen_t); static int blank_state; static int blank_timer_expired; enum { blank_off = 0, blank_normal_wait, blank_vesa_wait, }; /* * /sys/class/tty/tty0/ * * the attribute 'active' contains the name of the current vc * console and it supports poll() to detect vc switches */ static struct device *tty0dev; /* * Notifier list for console events. */ static ATOMIC_NOTIFIER_HEAD(vt_notifier_list); int register_vt_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&vt_notifier_list, nb); } EXPORT_SYMBOL_GPL(register_vt_notifier); int unregister_vt_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&vt_notifier_list, nb); } EXPORT_SYMBOL_GPL(unregister_vt_notifier); static void notify_write(struct vc_data *vc, unsigned int unicode) { struct vt_notifier_param param = { .vc = vc, .c = unicode }; atomic_notifier_call_chain(&vt_notifier_list, VT_WRITE, &param); } static void notify_update(struct vc_data *vc) { struct vt_notifier_param param = { .vc = vc }; atomic_notifier_call_chain(&vt_notifier_list, VT_UPDATE, &param); } /* * Low-Level Functions */ static inline bool con_is_fg(const struct vc_data *vc) { return vc->vc_num == fg_console; } static inline bool con_should_update(const struct vc_data *vc) { return con_is_visible(vc) && !console_blanked; } static inline u16 *screenpos(const struct vc_data *vc, unsigned int offset, bool viewed) { unsigned long origin = viewed ? vc->vc_visible_origin : vc->vc_origin; return (u16 *)(origin + offset); } static void con_putc(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x) { if (vc->vc_sw->con_putc) vc->vc_sw->con_putc(vc, ca, y, x); else vc->vc_sw->con_putcs(vc, &ca, 1, y, x); } /* Called from the keyboard irq path..
*/ static inline void scrolldelta(int lines) { /* FIXME */ /* scrolldelta needs some kind of consistency lock, but the BKL was and still is not protecting versus the scheduled back end */ scrollback_delta += lines; schedule_console_callback(); } void schedule_console_callback(void) { schedule_work(&console_work); } /* * Code to manage unicode-based screen buffers */ /* * Our screen buffer is preceded by an array of line pointers so that * scrolling only implies some pointer shuffling. */ static u32 **vc_uniscr_alloc(unsigned int cols, unsigned int rows) { u32 **uni_lines; void *p; unsigned int memsize, i, col_size = cols * sizeof(**uni_lines); /* allocate everything in one go */ memsize = col_size * rows; memsize += rows * sizeof(*uni_lines); uni_lines = vzalloc(memsize); if (!uni_lines) return NULL; /* initial line pointers */ p = uni_lines + rows; for (i = 0; i < rows; i++) { uni_lines[i] = p; p += col_size; } return uni_lines; } static void vc_uniscr_free(u32 **uni_lines) { vfree(uni_lines); } static void vc_uniscr_set(struct vc_data *vc, u32 **new_uni_lines) { vc_uniscr_free(vc->vc_uni_lines); vc->vc_uni_lines = new_uni_lines; } static void vc_uniscr_putc(struct vc_data *vc, u32 uc) { if (vc->vc_uni_lines) vc->vc_uni_lines[vc->state.y][vc->state.x] = uc; } static void vc_uniscr_insert(struct vc_data *vc, unsigned int nr) { if (vc->vc_uni_lines) { u32 *ln = vc->vc_uni_lines[vc->state.y]; unsigned int x = vc->state.x, cols = vc->vc_cols; memmove(&ln[x + nr], &ln[x], (cols - x - nr) * sizeof(*ln)); memset32(&ln[x], ' ', nr); } } static void vc_uniscr_delete(struct vc_data *vc, unsigned int nr) { if (vc->vc_uni_lines) { u32 *ln = vc->vc_uni_lines[vc->state.y]; unsigned int x = vc->state.x, cols = vc->vc_cols; memmove(&ln[x], &ln[x + nr], (cols - x - nr) * sizeof(*ln)); memset32(&ln[cols - nr], ' ', nr); } } static void vc_uniscr_clear_line(struct vc_data *vc, unsigned int x, unsigned int nr) { if (vc->vc_uni_lines) memset32(&vc->vc_uni_lines[vc->state.y][x], ' ', nr); } static void vc_uniscr_clear_lines(struct vc_data *vc, unsigned int y, unsigned int nr) { if (vc->vc_uni_lines) while (nr--) memset32(vc->vc_uni_lines[y++], ' ', vc->vc_cols); } /* juggling array rotation algorithm (complexity O(N), size complexity O(1)) */ static void juggle_array(u32 **array, unsigned int size, unsigned int nr) { unsigned int gcd_idx; for (gcd_idx = 0; gcd_idx < gcd(nr, size); gcd_idx++) { u32 *gcd_idx_val = array[gcd_idx]; unsigned int dst_idx = gcd_idx; while (1) { unsigned int src_idx = (dst_idx + nr) % size; if (src_idx == gcd_idx) break; array[dst_idx] = array[src_idx]; dst_idx = src_idx; } array[dst_idx] = gcd_idx_val; } } static void vc_uniscr_scroll(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int nr) { u32 **uni_lines = vc->vc_uni_lines; unsigned int size = bottom - top; if (!uni_lines) return; if (dir == SM_DOWN) { juggle_array(&uni_lines[top], size, size - nr); vc_uniscr_clear_lines(vc, top, nr); } else { juggle_array(&uni_lines[top], size, nr); vc_uniscr_clear_lines(vc, bottom - nr, nr); } } static u32 vc_uniscr_getc(struct vc_data *vc, int relative_pos) { int pos = vc->state.x + vc->vc_need_wrap + relative_pos; if (vc->vc_uni_lines && in_range(pos, 0, vc->vc_cols)) return vc->vc_uni_lines[vc->state.y][pos]; return 0; } static void vc_uniscr_copy_area(u32 **dst_lines, unsigned int dst_cols, unsigned int dst_rows, u32 **src_lines, unsigned int src_cols, unsigned int src_top_row, unsigned int src_bot_row) { unsigned int dst_row = 0; if (!dst_lines) 
return; while (src_top_row < src_bot_row) { u32 *src_line = src_lines[src_top_row]; u32 *dst_line = dst_lines[dst_row]; memcpy(dst_line, src_line, src_cols * sizeof(*src_line)); if (dst_cols - src_cols) memset32(dst_line + src_cols, ' ', dst_cols - src_cols); src_top_row++; dst_row++; } while (dst_row < dst_rows) { u32 *dst_line = dst_lines[dst_row]; memset32(dst_line, ' ', dst_cols); dst_row++; } } /* * Called from vcs_read() to make sure unicode screen retrieval is possible. * This will initialize the unicode screen buffer if not already done. * This returns 0 if OK, or a negative error code otherwise. * In particular, -ENODATA is returned if the console is not in UTF-8 mode. */ int vc_uniscr_check(struct vc_data *vc) { u32 **uni_lines; unsigned short *p; int x, y, mask; WARN_CONSOLE_UNLOCKED(); if (!vc->vc_utf) return -ENODATA; if (vc->vc_uni_lines) return 0; uni_lines = vc_uniscr_alloc(vc->vc_cols, vc->vc_rows); if (!uni_lines) return -ENOMEM; /* * Let's populate it initially with (imperfect) reverse translation. * This is the next best thing we can do short of having it enabled * from the start even when no users rely on this functionality. True * unicode content will be available after a complete screen refresh. */ p = (unsigned short *)vc->vc_origin; mask = vc->vc_hi_font_mask | 0xff; for (y = 0; y < vc->vc_rows; y++) { u32 *line = uni_lines[y]; for (x = 0; x < vc->vc_cols; x++) { u16 glyph = scr_readw(p++) & mask; line[x] = inverse_translate(vc, glyph, true); } } vc->vc_uni_lines = uni_lines; return 0; } /* * Called from vcs_read() to get the unicode data from the screen. * This must be preceded by a successful call to vc_uniscr_check() once * the console lock has been taken. */ void vc_uniscr_copy_line(const struct vc_data *vc, void *dest, bool viewed, unsigned int row, unsigned int col, unsigned int nr) { u32 **uni_lines = vc->vc_uni_lines; int offset = row * vc->vc_size_row + col * 2; unsigned long pos; if (WARN_ON_ONCE(!uni_lines)) return; pos = (unsigned long)screenpos(vc, offset, viewed); if (pos >= vc->vc_origin && pos < vc->vc_scr_end) { /* * Desired position falls in the main screen buffer. * However the actual row/col might be different if * scrollback is active. */ row = (pos - vc->vc_origin) / vc->vc_size_row; col = ((pos - vc->vc_origin) % vc->vc_size_row) / 2; memcpy(dest, &uni_lines[row][col], nr * sizeof(u32)); } else { /* * Scrollback is active. For now let's simply backtranslate * the screen glyphs until the unicode screen buffer does * synchronize with console display drivers for a scrollback * buffer of its own. 
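* The glyphs are read straight from display memory and pushed through
* inverse_translate(), so the result is only as good as the font's
* reverse mapping.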
*/ u16 *p = (u16 *)pos; int mask = vc->vc_hi_font_mask | 0xff; u32 *uni_buf = dest; while (nr--) { u16 glyph = scr_readw(p++) & mask; *uni_buf++ = inverse_translate(vc, glyph, true); } } } static void con_scroll(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int nr) { unsigned int rows = bottom - top; u16 *clear, *dst, *src; if (top + nr >= bottom) nr = rows - 1; if (bottom > vc->vc_rows || top >= bottom || nr < 1) return; vc_uniscr_scroll(vc, top, bottom, dir, nr); if (con_is_visible(vc) && vc->vc_sw->con_scroll(vc, top, bottom, dir, nr)) return; src = clear = (u16 *)(vc->vc_origin + vc->vc_size_row * top); dst = (u16 *)(vc->vc_origin + vc->vc_size_row * (top + nr)); if (dir == SM_UP) { clear = src + (rows - nr) * vc->vc_cols; swap(src, dst); } scr_memmovew(dst, src, (rows - nr) * vc->vc_size_row); scr_memsetw(clear, vc->vc_video_erase_char, vc->vc_size_row * nr); } static void do_update_region(struct vc_data *vc, unsigned long start, int count) { unsigned int xx, yy, offset; u16 *p = (u16 *)start; offset = (start - vc->vc_origin) / 2; xx = offset % vc->vc_cols; yy = offset / vc->vc_cols; for(;;) { u16 attrib = scr_readw(p) & 0xff00; int startx = xx; u16 *q = p; while (xx < vc->vc_cols && count) { if (attrib != (scr_readw(p) & 0xff00)) { if (p > q) vc->vc_sw->con_putcs(vc, q, p-q, yy, startx); startx = xx; q = p; attrib = scr_readw(p) & 0xff00; } p++; xx++; count--; } if (p > q) vc->vc_sw->con_putcs(vc, q, p-q, yy, startx); if (!count) break; xx = 0; yy++; } } void update_region(struct vc_data *vc, unsigned long start, int count) { WARN_CONSOLE_UNLOCKED(); if (con_should_update(vc)) { hide_cursor(vc); do_update_region(vc, start, count); set_cursor(vc); } } EXPORT_SYMBOL(update_region); /* Structure of attributes is hardware-dependent */ static u8 build_attr(struct vc_data *vc, u8 _color, enum vc_intensity _intensity, bool _blink, bool _underline, bool _reverse, bool _italic) { if (vc->vc_sw->con_build_attr) return vc->vc_sw->con_build_attr(vc, _color, _intensity, _blink, _underline, _reverse, _italic); /* * ++roman: I completely changed the attribute format for monochrome * mode (!can_do_color). The formerly used MDA (monochrome display * adapter) format didn't allow the combination of certain effects. 
* Now the attribute is just a bit vector: * Bit 0..1: intensity (0..2) * Bit 2 : underline * Bit 3 : reverse * Bit 7 : blink */ { u8 a = _color; if (!vc->vc_can_do_color) return _intensity | (_italic << 1) | (_underline << 2) | (_reverse << 3) | (_blink << 7); if (_italic) a = (a & 0xF0) | vc->vc_itcolor; else if (_underline) a = (a & 0xf0) | vc->vc_ulcolor; else if (_intensity == VCI_HALF_BRIGHT) a = (a & 0xf0) | vc->vc_halfcolor; if (_reverse) a = (a & 0x88) | (((a >> 4) | (a << 4)) & 0x77); if (_blink) a ^= 0x80; if (_intensity == VCI_BOLD) a ^= 0x08; if (vc->vc_hi_font_mask == 0x100) a <<= 1; return a; } } static void update_attr(struct vc_data *vc) { vc->vc_attr = build_attr(vc, vc->state.color, vc->state.intensity, vc->state.blink, vc->state.underline, vc->state.reverse ^ vc->vc_decscnm, vc->state.italic); vc->vc_video_erase_char = ' ' | (build_attr(vc, vc->state.color, VCI_NORMAL, vc->state.blink, false, vc->vc_decscnm, false) << 8); } /* Note: inverting the screen twice should revert to the original state */ void invert_screen(struct vc_data *vc, int offset, int count, bool viewed) { u16 *p; WARN_CONSOLE_UNLOCKED(); count /= 2; p = screenpos(vc, offset, viewed); if (vc->vc_sw->con_invert_region) { vc->vc_sw->con_invert_region(vc, p, count); } else { u16 *q = p; int cnt = count; u16 a; if (!vc->vc_can_do_color) { while (cnt--) { a = scr_readw(q); a ^= 0x0800; scr_writew(a, q); q++; } } else if (vc->vc_hi_font_mask == 0x100) { while (cnt--) { a = scr_readw(q); a = (a & 0x11ff) | ((a & 0xe000) >> 4) | ((a & 0x0e00) << 4); scr_writew(a, q); q++; } } else { while (cnt--) { a = scr_readw(q); a = (a & 0x88ff) | ((a & 0x7000) >> 4) | ((a & 0x0700) << 4); scr_writew(a, q); q++; } } } if (con_should_update(vc)) do_update_region(vc, (unsigned long) p, count); notify_update(vc); } /* used by selection: complement pointer position */ void complement_pos(struct vc_data *vc, int offset) { static int old_offset = -1; static unsigned short old; static unsigned short oldx, oldy; WARN_CONSOLE_UNLOCKED(); if (old_offset != -1 && old_offset >= 0 && old_offset < vc->vc_screenbuf_size) { scr_writew(old, screenpos(vc, old_offset, true)); if (con_should_update(vc)) con_putc(vc, old, oldy, oldx); notify_update(vc); } old_offset = offset; if (offset != -1 && offset >= 0 && offset < vc->vc_screenbuf_size) { unsigned short new; u16 *p = screenpos(vc, offset, true); old = scr_readw(p); new = old ^ vc->vc_complement_mask; scr_writew(new, p); if (con_should_update(vc)) { oldx = (offset >> 1) % vc->vc_cols; oldy = (offset >> 1) / vc->vc_cols; con_putc(vc, new, oldy, oldx); } notify_update(vc); } } static void insert_char(struct vc_data *vc, unsigned int nr) { unsigned short *p = (unsigned short *) vc->vc_pos; vc_uniscr_insert(vc, nr); scr_memmovew(p + nr, p, (vc->vc_cols - vc->state.x - nr) * 2); scr_memsetw(p, vc->vc_video_erase_char, nr * 2); vc->vc_need_wrap = 0; if (con_should_update(vc)) do_update_region(vc, (unsigned long) p, vc->vc_cols - vc->state.x); } static void delete_char(struct vc_data *vc, unsigned int nr) { unsigned short *p = (unsigned short *) vc->vc_pos; vc_uniscr_delete(vc, nr); scr_memmovew(p, p + nr, (vc->vc_cols - vc->state.x - nr) * 2); scr_memsetw(p + vc->vc_cols - vc->state.x - nr, vc->vc_video_erase_char, nr * 2); vc->vc_need_wrap = 0; if (con_should_update(vc)) do_update_region(vc, (unsigned long) p, vc->vc_cols - vc->state.x); } static int softcursor_original = -1; static void add_softcursor(struct vc_data *vc) { int i = scr_readw((u16 *) vc->vc_pos); u32 type = vc->vc_cursor_type; if 
(!(type & CUR_SW)) return; if (softcursor_original != -1) return; softcursor_original = i; i |= CUR_SET(type); i ^= CUR_CHANGE(type); if ((type & CUR_ALWAYS_BG) && (softcursor_original & CUR_BG) == (i & CUR_BG)) i ^= CUR_BG; if ((type & CUR_INVERT_FG_BG) && (i & CUR_FG) == ((i & CUR_BG) >> 4)) i ^= CUR_FG; scr_writew(i, (u16 *)vc->vc_pos); if (con_should_update(vc)) con_putc(vc, i, vc->state.y, vc->state.x); } static void hide_softcursor(struct vc_data *vc) { if (softcursor_original != -1) { scr_writew(softcursor_original, (u16 *)vc->vc_pos); if (con_should_update(vc)) con_putc(vc, softcursor_original, vc->state.y, vc->state.x); softcursor_original = -1; } } static void hide_cursor(struct vc_data *vc) { if (vc_is_sel(vc)) clear_selection(); vc->vc_sw->con_cursor(vc, false); hide_softcursor(vc); } static void set_cursor(struct vc_data *vc) { if (!con_is_fg(vc) || console_blanked || vc->vc_mode == KD_GRAPHICS) return; if (vc->vc_deccm) { if (vc_is_sel(vc)) clear_selection(); add_softcursor(vc); if (CUR_SIZE(vc->vc_cursor_type) != CUR_NONE) vc->vc_sw->con_cursor(vc, true); } else hide_cursor(vc); } static void set_origin(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (!con_is_visible(vc) || !vc->vc_sw->con_set_origin || !vc->vc_sw->con_set_origin(vc)) vc->vc_origin = (unsigned long)vc->vc_screenbuf; vc->vc_visible_origin = vc->vc_origin; vc->vc_scr_end = vc->vc_origin + vc->vc_screenbuf_size; vc->vc_pos = vc->vc_origin + vc->vc_size_row * vc->state.y + 2 * vc->state.x; } static void save_screen(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (vc->vc_sw->con_save_screen) vc->vc_sw->con_save_screen(vc); } static void flush_scrollback(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); set_origin(vc); if (!con_is_visible(vc)) return; /* * The legacy way for flushing the scrollback buffer is to use a side * effect of the con_switch method. We do it only on the foreground * console as background consoles have no scrollback buffers in that * case and we obviously don't want to switch to them. */ hide_cursor(vc); vc->vc_sw->con_switch(vc); set_cursor(vc); } /* * Redrawing of screen */ void clear_buffer_attributes(struct vc_data *vc) { unsigned short *p = (unsigned short *)vc->vc_origin; int count = vc->vc_screenbuf_size / 2; int mask = vc->vc_hi_font_mask | 0xff; for (; count > 0; count--, p++) { scr_writew((scr_readw(p)&mask) | (vc->vc_video_erase_char & ~mask), p); } } void redraw_screen(struct vc_data *vc, int is_switch) { int redraw = 0; WARN_CONSOLE_UNLOCKED(); if (!vc) { /* strange ... */ /* printk("redraw_screen: tty %d not allocated ??\n", new_console+1); */ return; } if (is_switch) { struct vc_data *old_vc = vc_cons[fg_console].d; if (old_vc == vc) return; if (!con_is_visible(vc)) redraw = 1; *vc->vc_display_fg = vc; fg_console = vc->vc_num; hide_cursor(old_vc); if (!con_is_visible(old_vc)) { save_screen(old_vc); set_origin(old_vc); } if (tty0dev) sysfs_notify(&tty0dev->kobj, NULL, "active"); } else { hide_cursor(vc); redraw = 1; } if (redraw) { bool update; int old_was_color = vc->vc_can_do_color; set_origin(vc); update = vc->vc_sw->con_switch(vc); set_palette(vc); /* * If console changed from mono<->color, the best we can do * is to clear the buffer attributes. As it currently stands, * rebuilding new attributes from the old buffer is not doable * without overly complex code. 
*/ if (old_was_color != vc->vc_can_do_color) { update_attr(vc); clear_buffer_attributes(vc); } if (update && vc->vc_mode != KD_GRAPHICS) do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); } set_cursor(vc); if (is_switch) { vt_set_leds_compute_shiftstate(); notify_update(vc); } } EXPORT_SYMBOL(redraw_screen); /* * Allocation, freeing and resizing of VTs. */ int vc_cons_allocated(unsigned int i) { return (i < MAX_NR_CONSOLES && vc_cons[i].d); } static void visual_init(struct vc_data *vc, int num, bool init) { /* ++Geert: vc->vc_sw->con_init determines console size */ if (vc->vc_sw) module_put(vc->vc_sw->owner); vc->vc_sw = conswitchp; if (con_driver_map[num]) vc->vc_sw = con_driver_map[num]; __module_get(vc->vc_sw->owner); vc->vc_num = num; vc->vc_display_fg = &master_display_fg; if (vc->uni_pagedict_loc) con_free_unimap(vc); vc->uni_pagedict_loc = &vc->uni_pagedict; vc->uni_pagedict = NULL; vc->vc_hi_font_mask = 0; vc->vc_complement_mask = 0; vc->vc_can_do_color = 0; vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; vc->vc_sw->con_init(vc, init); if (!vc->vc_complement_mask) vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800; vc->vc_s_complement_mask = vc->vc_complement_mask; vc->vc_size_row = vc->vc_cols << 1; vc->vc_screenbuf_size = vc->vc_rows * vc->vc_size_row; } static void visual_deinit(struct vc_data *vc) { vc->vc_sw->con_deinit(vc); module_put(vc->vc_sw->owner); } static void vc_port_destruct(struct tty_port *port) { struct vc_data *vc = container_of(port, struct vc_data, port); kfree(vc); } static const struct tty_port_operations vc_port_ops = { .destruct = vc_port_destruct, }; /* * Change # of rows and columns (0 means unchanged/the size of fg_console) * [this is to be used together with some user program * like resize that changes the hardware videomode] */ #define VC_MAXCOL (32767) #define VC_MAXROW (32767) int vc_allocate(unsigned int currcons) /* return 0 on success */ { struct vt_notifier_param param; struct vc_data *vc; int err; WARN_CONSOLE_UNLOCKED(); if (currcons >= MAX_NR_CONSOLES) return -ENXIO; if (vc_cons[currcons].d) return 0; /* due to the granularity of kmalloc, we waste some memory here */ /* the alloc is done in two steps, to optimize the common situation of a 25x80 console (structsize=216, screenbuf_size=4000) */ /* although the numbers above are not valid since long ago, the point is still up-to-date and the comment still has its value even if only as a historical artifact. 
--mj, July 1998 */ param.vc = vc = kzalloc(sizeof(struct vc_data), GFP_KERNEL); if (!vc) return -ENOMEM; vc_cons[currcons].d = vc; tty_port_init(&vc->port); vc->port.ops = &vc_port_ops; INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); visual_init(vc, currcons, true); if (!*vc->uni_pagedict_loc) con_set_default_unimap(vc); err = -EINVAL; if (vc->vc_cols > VC_MAXCOL || vc->vc_rows > VC_MAXROW || vc->vc_screenbuf_size > KMALLOC_MAX_SIZE || !vc->vc_screenbuf_size) goto err_free; err = -ENOMEM; vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_KERNEL); if (!vc->vc_screenbuf) goto err_free; /* If no drivers have overridden us and the user didn't pass a boot option, default to displaying the cursor */ if (global_cursor_default == -1) global_cursor_default = 1; vc_init(vc, 1); vcs_make_sysfs(currcons); atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, ¶m); return 0; err_free: visual_deinit(vc); kfree(vc); vc_cons[currcons].d = NULL; return err; } static inline int resize_screen(struct vc_data *vc, int width, int height, bool from_user) { /* Resizes the resolution of the display adapater */ int err = 0; if (vc->vc_sw->con_resize) err = vc->vc_sw->con_resize(vc, width, height, from_user); return err; } /** * vc_do_resize - resizing method for the tty * @tty: tty being resized * @vc: virtual console private data * @cols: columns * @lines: lines * @from_user: invoked by a user? * * Resize a virtual console, clipping according to the actual constraints. If * the caller passes a tty structure then update the termios winsize * information and perform any necessary signal handling. * * Locking: Caller must hold the console semaphore. Takes the termios rwsem and * ctrl.lock of the tty IFF a tty is passed. */ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc, unsigned int cols, unsigned int lines, bool from_user) { unsigned long old_origin, new_origin, new_scr_end, rlth, rrem, err = 0; unsigned long end; unsigned int old_rows, old_row_size, first_copied_row; unsigned int new_cols, new_rows, new_row_size, new_screen_size; unsigned short *oldscreen, *newscreen; u32 **new_uniscr = NULL; WARN_CONSOLE_UNLOCKED(); if (cols > VC_MAXCOL || lines > VC_MAXROW) return -EINVAL; new_cols = (cols ? cols : vc->vc_cols); new_rows = (lines ? lines : vc->vc_rows); new_row_size = new_cols << 1; new_screen_size = new_row_size * new_rows; if (new_cols == vc->vc_cols && new_rows == vc->vc_rows) { /* * This function is being called here to cover the case * where the userspace calls the FBIOPUT_VSCREENINFO twice, * passing the same fb_var_screeninfo containing the fields * yres/xres equal to a number non-multiple of vc_font.height * and yres_virtual/xres_virtual equal to number lesser than the * vc_font.height and yres/xres. * In the second call, the struct fb_var_screeninfo isn't * being modified by the underlying driver because of the * if above, and this causes the fbcon_display->vrows to become * negative and it eventually leads to out-of-bound * access by the imageblit function. * To give the correct values to the struct and to not have * to deal with possible errors from the code below, we call * the resize_screen here as well. 
*/ return resize_screen(vc, new_cols, new_rows, from_user); } if (new_screen_size > KMALLOC_MAX_SIZE || !new_screen_size) return -EINVAL; newscreen = kzalloc(new_screen_size, GFP_USER); if (!newscreen) return -ENOMEM; if (vc->vc_uni_lines) { new_uniscr = vc_uniscr_alloc(new_cols, new_rows); if (!new_uniscr) { kfree(newscreen); return -ENOMEM; } } if (vc_is_sel(vc)) clear_selection(); old_rows = vc->vc_rows; old_row_size = vc->vc_size_row; err = resize_screen(vc, new_cols, new_rows, from_user); if (err) { kfree(newscreen); vc_uniscr_free(new_uniscr); return err; } vc->vc_rows = new_rows; vc->vc_cols = new_cols; vc->vc_size_row = new_row_size; vc->vc_screenbuf_size = new_screen_size; rlth = min(old_row_size, new_row_size); rrem = new_row_size - rlth; old_origin = vc->vc_origin; new_origin = (long) newscreen; new_scr_end = new_origin + new_screen_size; if (vc->state.y > new_rows) { if (old_rows - vc->state.y < new_rows) { /* * Cursor near the bottom, copy contents from the * bottom of buffer */ first_copied_row = (old_rows - new_rows); } else { /* * Cursor is in no man's land, copy 1/2 screenful * from the top and bottom of cursor position */ first_copied_row = (vc->state.y - new_rows/2); } old_origin += first_copied_row * old_row_size; } else first_copied_row = 0; end = old_origin + old_row_size * min(old_rows, new_rows); vc_uniscr_copy_area(new_uniscr, new_cols, new_rows, vc->vc_uni_lines, rlth/2, first_copied_row, min(old_rows, new_rows)); vc_uniscr_set(vc, new_uniscr); update_attr(vc); while (old_origin < end) { scr_memcpyw((unsigned short *) new_origin, (unsigned short *) old_origin, rlth); if (rrem) scr_memsetw((void *)(new_origin + rlth), vc->vc_video_erase_char, rrem); old_origin += old_row_size; new_origin += new_row_size; } if (new_scr_end > new_origin) scr_memsetw((void *)new_origin, vc->vc_video_erase_char, new_scr_end - new_origin); oldscreen = vc->vc_screenbuf; vc->vc_screenbuf = newscreen; vc->vc_screenbuf_size = new_screen_size; set_origin(vc); kfree(oldscreen); /* do part of a reset_terminal() */ vc->vc_top = 0; vc->vc_bottom = vc->vc_rows; gotoxy(vc, vc->state.x, vc->state.y); save_cur(vc); if (tty) { /* Rewrite the requested winsize data with the actual resulting sizes */ struct winsize ws; memset(&ws, 0, sizeof(ws)); ws.ws_row = vc->vc_rows; ws.ws_col = vc->vc_cols; ws.ws_ypixel = vc->vc_scan_lines; tty_do_resize(tty, &ws); } if (con_is_visible(vc)) update_screen(vc); vt_event_post(VT_EVENT_RESIZE, vc->vc_num, vc->vc_num); notify_update(vc); return err; } /** * __vc_resize - resize a VT * @vc: virtual console * @cols: columns * @rows: rows * @from_user: invoked by a user? * * Resize a virtual console as seen from the console end of things. We use the * common vc_do_resize() method to update the structures. * * Locking: The caller must hold the console sem to protect console internals * and @vc->port.tty. */ int __vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows, bool from_user) { return vc_do_resize(vc->port.tty, vc, cols, rows, from_user); } EXPORT_SYMBOL(__vc_resize); /** * vt_resize - resize a VT * @tty: tty to resize * @ws: winsize attributes * * Resize a virtual terminal. This is called by the tty layer as we register * our own handler for resizing. The mutual helper does all the actual work. * * Locking: Takes the console sem and the called methods then take the tty * termios_rwsem and the tty ctrl.lock in that order. 
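* The console lock itself is taken with guard(console_lock)() below and is
* released automatically on every return path.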
*/ static int vt_resize(struct tty_struct *tty, struct winsize *ws) { struct vc_data *vc = tty->driver_data; guard(console_lock)(); return vc_do_resize(tty, vc, ws->ws_col, ws->ws_row, false); } struct vc_data *vc_deallocate(unsigned int currcons) { struct vc_data *vc = NULL; WARN_CONSOLE_UNLOCKED(); if (vc_cons_allocated(currcons)) { struct vt_notifier_param param; param.vc = vc = vc_cons[currcons].d; atomic_notifier_call_chain(&vt_notifier_list, VT_DEALLOCATE, ¶m); vcs_remove_sysfs(currcons); visual_deinit(vc); con_free_unimap(vc); put_pid(vc->vt_pid); vc_uniscr_set(vc, NULL); kfree(vc->vc_screenbuf); vc_cons[currcons].d = NULL; if (vc->vc_saved_screen != NULL) { kfree(vc->vc_saved_screen); vc->vc_saved_screen = NULL; } } return vc; } /* * VT102 emulator */ enum { EPecma = 0, EPdec, EPeq, EPgt, EPlt}; #define set_kbd(vc, x) vt_set_kbd_mode_bit((vc)->vc_num, (x)) #define clr_kbd(vc, x) vt_clr_kbd_mode_bit((vc)->vc_num, (x)) #define is_kbd(vc, x) vt_get_kbd_mode_bit((vc)->vc_num, (x)) #define decarm VC_REPEAT #define decckm VC_CKMODE #define kbdapplic VC_APPLIC #define lnm VC_CRLF const unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7, 8,12,10,14, 9,13,11,15 }; EXPORT_SYMBOL(color_table); /* the default colour table, for VGA+ colour systems */ unsigned char default_red[] = { 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, 0x55, 0xff, 0x55, 0xff, 0x55, 0xff, 0x55, 0xff }; module_param_array(default_red, byte, NULL, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(default_red); unsigned char default_grn[] = { 0x00, 0x00, 0xaa, 0x55, 0x00, 0x00, 0xaa, 0xaa, 0x55, 0x55, 0xff, 0xff, 0x55, 0x55, 0xff, 0xff }; module_param_array(default_grn, byte, NULL, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(default_grn); unsigned char default_blu[] = { 0x00, 0x00, 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0x55, 0x55, 0x55, 0x55, 0xff, 0xff, 0xff, 0xff }; module_param_array(default_blu, byte, NULL, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(default_blu); /* * gotoxy() must verify all boundaries, because the arguments * might also be negative. If the given position is out of * bounds, the cursor is placed at the nearest margin. */ static void gotoxy(struct vc_data *vc, int new_x, int new_y) { int min_y, max_y; if (new_x < 0) vc->state.x = 0; else { if (new_x >= vc->vc_cols) vc->state.x = vc->vc_cols - 1; else vc->state.x = new_x; } if (vc->vc_decom) { min_y = vc->vc_top; max_y = vc->vc_bottom; } else { min_y = 0; max_y = vc->vc_rows; } if (new_y < min_y) vc->state.y = min_y; else if (new_y >= max_y) vc->state.y = max_y - 1; else vc->state.y = new_y; vc->vc_pos = vc->vc_origin + vc->state.y * vc->vc_size_row + (vc->state.x << 1); vc->vc_need_wrap = 0; } /* for absolute user moves, when decom is set */ static void gotoxay(struct vc_data *vc, int new_x, int new_y) { gotoxy(vc, new_x, vc->vc_decom ? 
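/* origin mode: the row is relative to the top of the scrolling region */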
(vc->vc_top + new_y) : new_y); } void scrollback(struct vc_data *vc) { scrolldelta(-(vc->vc_rows / 2)); } void scrollfront(struct vc_data *vc, int lines) { if (!lines) lines = vc->vc_rows / 2; scrolldelta(lines); } static void lf(struct vc_data *vc) { /* don't scroll if above bottom of scrolling region, or * if below scrolling region */ if (vc->state.y + 1 == vc->vc_bottom) con_scroll(vc, vc->vc_top, vc->vc_bottom, SM_UP, 1); else if (vc->state.y < vc->vc_rows - 1) { vc->state.y++; vc->vc_pos += vc->vc_size_row; } vc->vc_need_wrap = 0; notify_write(vc, '\n'); } static void ri(struct vc_data *vc) { /* don't scroll if below top of scrolling region, or * if above scrolling region */ if (vc->state.y == vc->vc_top) con_scroll(vc, vc->vc_top, vc->vc_bottom, SM_DOWN, 1); else if (vc->state.y > 0) { vc->state.y--; vc->vc_pos -= vc->vc_size_row; } vc->vc_need_wrap = 0; } static inline void cr(struct vc_data *vc) { vc->vc_pos -= vc->state.x << 1; vc->vc_need_wrap = vc->state.x = 0; notify_write(vc, '\r'); } static inline void bs(struct vc_data *vc) { if (vc->state.x) { vc->vc_pos -= 2; vc->state.x--; vc->vc_need_wrap = 0; notify_write(vc, '\b'); } } static inline void del(struct vc_data *vc) { /* ignored */ } enum CSI_J { CSI_J_CURSOR_TO_END = 0, CSI_J_START_TO_CURSOR = 1, CSI_J_VISIBLE = 2, CSI_J_FULL = 3, }; static void csi_J(struct vc_data *vc, enum CSI_J vpar) { unsigned short *start; unsigned int count; switch (vpar) { case CSI_J_CURSOR_TO_END: vc_uniscr_clear_line(vc, vc->state.x, vc->vc_cols - vc->state.x); vc_uniscr_clear_lines(vc, vc->state.y + 1, vc->vc_rows - vc->state.y - 1); count = (vc->vc_scr_end - vc->vc_pos) >> 1; start = (unsigned short *)vc->vc_pos; break; case CSI_J_START_TO_CURSOR: vc_uniscr_clear_line(vc, 0, vc->state.x + 1); vc_uniscr_clear_lines(vc, 0, vc->state.y); count = ((vc->vc_pos - vc->vc_origin) >> 1) + 1; start = (unsigned short *)vc->vc_origin; break; case CSI_J_FULL: flush_scrollback(vc); fallthrough; case CSI_J_VISIBLE: vc_uniscr_clear_lines(vc, 0, vc->vc_rows); count = vc->vc_cols * vc->vc_rows; start = (unsigned short *)vc->vc_origin; break; default: return; } scr_memsetw(start, vc->vc_video_erase_char, 2 * count); if (con_should_update(vc)) do_update_region(vc, (unsigned long) start, count); vc->vc_need_wrap = 0; } enum { CSI_K_CURSOR_TO_LINEEND = 0, CSI_K_LINESTART_TO_CURSOR = 1, CSI_K_LINE = 2, }; static void csi_K(struct vc_data *vc) { unsigned int count; unsigned short *start = (unsigned short *)vc->vc_pos; int offset; switch (vc->vc_par[0]) { case CSI_K_CURSOR_TO_LINEEND: offset = 0; count = vc->vc_cols - vc->state.x; break; case CSI_K_LINESTART_TO_CURSOR: offset = -vc->state.x; count = vc->state.x + 1; break; case CSI_K_LINE: offset = -vc->state.x; count = vc->vc_cols; break; default: return; } vc_uniscr_clear_line(vc, vc->state.x + offset, count); scr_memsetw(start + offset, vc->vc_video_erase_char, 2 * count); vc->vc_need_wrap = 0; if (con_should_update(vc)) do_update_region(vc, (unsigned long)(start + offset), count); } /* erase the following count positions */ static void csi_X(struct vc_data *vc) { /* not vt100? 
*/ unsigned int count = clamp(vc->vc_par[0], 1, vc->vc_cols - vc->state.x); vc_uniscr_clear_line(vc, vc->state.x, count); scr_memsetw((unsigned short *)vc->vc_pos, vc->vc_video_erase_char, 2 * count); if (con_should_update(vc)) vc->vc_sw->con_clear(vc, vc->state.y, vc->state.x, count); vc->vc_need_wrap = 0; } static void default_attr(struct vc_data *vc) { vc->state.intensity = VCI_NORMAL; vc->state.italic = false; vc->state.underline = false; vc->state.reverse = false; vc->state.blink = false; vc->state.color = vc->vc_def_color; } struct rgb { u8 r; u8 g; u8 b; }; static void rgb_from_256(unsigned int i, struct rgb *c) { if (i < 8) { /* Standard colours. */ c->r = i&1 ? 0xaa : 0x00; c->g = i&2 ? 0xaa : 0x00; c->b = i&4 ? 0xaa : 0x00; } else if (i < 16) { c->r = i&1 ? 0xff : 0x55; c->g = i&2 ? 0xff : 0x55; c->b = i&4 ? 0xff : 0x55; } else if (i < 232) { /* 6x6x6 colour cube. */ i -= 16; c->b = i % 6 * 255 / 6; i /= 6; c->g = i % 6 * 255 / 6; i /= 6; c->r = i * 255 / 6; } else /* Grayscale ramp. */ c->r = c->g = c->b = i * 10 - 2312; } static void rgb_foreground(struct vc_data *vc, const struct rgb *c) { u8 hue = 0, max = max3(c->r, c->g, c->b); if (c->r > max / 2) hue |= 4; if (c->g > max / 2) hue |= 2; if (c->b > max / 2) hue |= 1; if (hue == 7 && max <= 0x55) { hue = 0; vc->state.intensity = VCI_BOLD; } else if (max > 0xaa) vc->state.intensity = VCI_BOLD; else vc->state.intensity = VCI_NORMAL; vc->state.color = (vc->state.color & 0xf0) | hue; } static void rgb_background(struct vc_data *vc, const struct rgb *c) { /* For backgrounds, err on the dark side. */ vc->state.color = (vc->state.color & 0x0f) | (c->r&0x80) >> 1 | (c->g&0x80) >> 2 | (c->b&0x80) >> 3; } /* * ITU T.416 Higher colour modes. They break the usual properties of SGR codes * and thus need to be detected and ignored by hand. That standard also * wants : rather than ; as separators but sequences containing : are currently * completely ignored by the parser. * * Subcommands 3 (CMY) and 4 (CMYK) are so insane there's no point in * supporting them. 
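* Only subcommands 5 (256-colour palette index) and 2 (direct 24-bit RGB)
* are honoured by vc_t416_color() below; everything else is left alone.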
*/ static int vc_t416_color(struct vc_data *vc, int i, void(*set_color)(struct vc_data *vc, const struct rgb *c)) { struct rgb c; i++; if (i > vc->vc_npar) return i; if (vc->vc_par[i] == 5 && i + 1 <= vc->vc_npar) { /* 256 colours */ i++; rgb_from_256(vc->vc_par[i], &c); } else if (vc->vc_par[i] == 2 && i + 3 <= vc->vc_npar) { /* 24 bit */ c.r = vc->vc_par[i + 1]; c.g = vc->vc_par[i + 2]; c.b = vc->vc_par[i + 3]; i += 3; } else return i; set_color(vc, &c); return i; } enum { CSI_m_DEFAULT = 0, CSI_m_BOLD = 1, CSI_m_HALF_BRIGHT = 2, CSI_m_ITALIC = 3, CSI_m_UNDERLINE = 4, CSI_m_BLINK = 5, CSI_m_REVERSE = 7, CSI_m_PRI_FONT = 10, CSI_m_ALT_FONT1 = 11, CSI_m_ALT_FONT2 = 12, CSI_m_DOUBLE_UNDERLINE = 21, CSI_m_NORMAL_INTENSITY = 22, CSI_m_NO_ITALIC = 23, CSI_m_NO_UNDERLINE = 24, CSI_m_NO_BLINK = 25, CSI_m_NO_REVERSE = 27, CSI_m_FG_COLOR_BEG = 30, CSI_m_FG_COLOR_END = 37, CSI_m_FG_COLOR = 38, CSI_m_DEFAULT_FG_COLOR = 39, CSI_m_BG_COLOR_BEG = 40, CSI_m_BG_COLOR_END = 47, CSI_m_BG_COLOR = 48, CSI_m_DEFAULT_BG_COLOR = 49, CSI_m_BRIGHT_FG_COLOR_BEG = 90, CSI_m_BRIGHT_FG_COLOR_END = 97, CSI_m_BRIGHT_FG_COLOR_OFF = CSI_m_BRIGHT_FG_COLOR_BEG - CSI_m_FG_COLOR_BEG, CSI_m_BRIGHT_BG_COLOR_BEG = 100, CSI_m_BRIGHT_BG_COLOR_END = 107, CSI_m_BRIGHT_BG_COLOR_OFF = CSI_m_BRIGHT_BG_COLOR_BEG - CSI_m_BG_COLOR_BEG, }; /* console_lock is held */ static void csi_m(struct vc_data *vc) { int i; for (i = 0; i <= vc->vc_npar; i++) switch (vc->vc_par[i]) { case CSI_m_DEFAULT: /* all attributes off */ default_attr(vc); break; case CSI_m_BOLD: vc->state.intensity = VCI_BOLD; break; case CSI_m_HALF_BRIGHT: vc->state.intensity = VCI_HALF_BRIGHT; break; case CSI_m_ITALIC: vc->state.italic = true; break; case CSI_m_DOUBLE_UNDERLINE: /* * No console drivers support double underline, so * convert it to a single underline. */ case CSI_m_UNDERLINE: vc->state.underline = true; break; case CSI_m_BLINK: vc->state.blink = true; break; case CSI_m_REVERSE: vc->state.reverse = true; break; case CSI_m_PRI_FONT: /* ANSI X3.64-1979 (SCO-ish?) * Select primary font, don't display control chars if * defined, don't set bit 8 on output. */ vc->vc_translate = set_translate(vc->state.Gx_charset[vc->state.charset], vc); vc->vc_disp_ctrl = 0; vc->vc_toggle_meta = 0; break; case CSI_m_ALT_FONT1: /* ANSI X3.64-1979 (SCO-ish?) * Select first alternate font, lets chars < 32 be * displayed as ROM chars. */ vc->vc_translate = set_translate(IBMPC_MAP, vc); vc->vc_disp_ctrl = 1; vc->vc_toggle_meta = 0; break; case CSI_m_ALT_FONT2: /* ANSI X3.64-1979 (SCO-ish?) * Select second alternate font, toggle high bit * before displaying as ROM char. */ vc->vc_translate = set_translate(IBMPC_MAP, vc); vc->vc_disp_ctrl = 1; vc->vc_toggle_meta = 1; break; case CSI_m_NORMAL_INTENSITY: vc->state.intensity = VCI_NORMAL; break; case CSI_m_NO_ITALIC: vc->state.italic = false; break; case CSI_m_NO_UNDERLINE: vc->state.underline = false; break; case CSI_m_NO_BLINK: vc->state.blink = false; break; case CSI_m_NO_REVERSE: vc->state.reverse = false; break; case CSI_m_FG_COLOR: i = vc_t416_color(vc, i, rgb_foreground); break; case CSI_m_BG_COLOR: i = vc_t416_color(vc, i, rgb_background); break; case CSI_m_DEFAULT_FG_COLOR: vc->state.color = (vc->vc_def_color & 0x0f) | (vc->state.color & 0xf0); break; case CSI_m_DEFAULT_BG_COLOR: vc->state.color = (vc->vc_def_color & 0xf0) | (vc->state.color & 0x0f); break; case CSI_m_BRIGHT_FG_COLOR_BEG ... CSI_m_BRIGHT_FG_COLOR_END: vc->state.intensity = VCI_BOLD; vc->vc_par[i] -= CSI_m_BRIGHT_FG_COLOR_OFF; fallthrough; case CSI_m_FG_COLOR_BEG ... 
CSI_m_FG_COLOR_END: vc->vc_par[i] -= CSI_m_FG_COLOR_BEG; vc->state.color = color_table[vc->vc_par[i]] | (vc->state.color & 0xf0); break; case CSI_m_BRIGHT_BG_COLOR_BEG ... CSI_m_BRIGHT_BG_COLOR_END: vc->vc_par[i] -= CSI_m_BRIGHT_BG_COLOR_OFF; fallthrough; case CSI_m_BG_COLOR_BEG ... CSI_m_BG_COLOR_END: vc->vc_par[i] -= CSI_m_BG_COLOR_BEG; vc->state.color = (color_table[vc->vc_par[i]] << 4) | (vc->state.color & 0x0f); break; } update_attr(vc); } static void respond_string(const char *p, size_t len, struct tty_port *port) { tty_insert_flip_string(port, p, len); tty_flip_buffer_push(port); } static void cursor_report(struct vc_data *vc, struct tty_struct *tty) { char buf[40]; int len; len = sprintf(buf, "\033[%d;%dR", vc->state.y + (vc->vc_decom ? vc->vc_top + 1 : 1), vc->state.x + 1); respond_string(buf, len, tty->port); } static inline void status_report(struct tty_struct *tty) { static const char teminal_ok[] = "\033[0n"; respond_string(teminal_ok, strlen(teminal_ok), tty->port); } static inline void respond_ID(struct tty_struct *tty) { /* terminal answer to an ESC-Z or csi0c query. */ static const char vt102_id[] = "\033[?6c"; respond_string(vt102_id, strlen(vt102_id), tty->port); } void mouse_report(struct tty_struct *tty, int butt, int mrx, int mry) { char buf[8]; int len; len = sprintf(buf, "\033[M%c%c%c", (char)(' ' + butt), (char)('!' + mrx), (char)('!' + mry)); respond_string(buf, len, tty->port); } /* invoked via ioctl(TIOCLINUX) and through set_selection_user */ int mouse_reporting(void) { return vc_cons[fg_console].d->vc_report_mouse; } /* invoked via ioctl(TIOCLINUX) */ static int get_bracketed_paste(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; return vc->vc_bracketed_paste; } /* console_lock is held */ static void enter_alt_screen(struct vc_data *vc) { unsigned int size = vc->vc_rows * vc->vc_cols * 2; if (vc->vc_saved_screen != NULL) return; /* Already inside an alt-screen */ vc->vc_saved_screen = kmemdup((u16 *)vc->vc_origin, size, GFP_KERNEL); if (vc->vc_saved_screen == NULL) return; vc->vc_saved_rows = vc->vc_rows; vc->vc_saved_cols = vc->vc_cols; save_cur(vc); /* clear entire screen */ csi_J(vc, CSI_J_FULL); } /* console_lock is held */ static void leave_alt_screen(struct vc_data *vc) { unsigned int rows = min(vc->vc_saved_rows, vc->vc_rows); unsigned int cols = min(vc->vc_saved_cols, vc->vc_cols); u16 *src, *dest; if (vc->vc_saved_screen == NULL) return; /* Not inside an alt-screen */ for (unsigned int r = 0; r < rows; r++) { src = vc->vc_saved_screen + r * vc->vc_saved_cols; dest = ((u16 *)vc->vc_origin) + r * vc->vc_cols; memcpy(dest, src, 2 * cols); } restore_cur(vc); /* Update the entire screen */ if (con_should_update(vc)) do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); kfree(vc->vc_saved_screen); vc->vc_saved_screen = NULL; } enum { CSI_DEC_hl_CURSOR_KEYS = 1, /* CKM: cursor keys send ^[Ox/^[[x */ CSI_DEC_hl_132_COLUMNS = 3, /* COLM: 80/132 mode switch */ CSI_DEC_hl_REVERSE_VIDEO = 5, /* SCNM */ CSI_DEC_hl_ORIGIN_MODE = 6, /* OM: origin relative/absolute */ CSI_DEC_hl_AUTOWRAP = 7, /* AWM */ CSI_DEC_hl_AUTOREPEAT = 8, /* ARM */ CSI_DEC_hl_MOUSE_X10 = 9, CSI_DEC_hl_SHOW_CURSOR = 25, /* TCEM */ CSI_DEC_hl_MOUSE_VT200 = 1000, CSI_DEC_hl_ALT_SCREEN = 1049, CSI_DEC_hl_BRACKETED_PASTE = 2004, }; /* console_lock is held */ static void csi_DEC_hl(struct vc_data *vc, bool on_off) { unsigned int i; for (i = 0; i <= vc->vc_npar; i++) switch (vc->vc_par[i]) { case CSI_DEC_hl_CURSOR_KEYS: if (on_off) set_kbd(vc, decckm); else clr_kbd(vc, 
decckm); break; case CSI_DEC_hl_132_COLUMNS: /* unimplemented */ #if 0 vc_resize(deccolm ? 132 : 80, vc->vc_rows); /* this alone does not suffice; some user mode utility has to change the hardware regs */ #endif break; case CSI_DEC_hl_REVERSE_VIDEO: if (vc->vc_decscnm != on_off) { vc->vc_decscnm = on_off; invert_screen(vc, 0, vc->vc_screenbuf_size, false); update_attr(vc); } break; case CSI_DEC_hl_ORIGIN_MODE: vc->vc_decom = on_off; gotoxay(vc, 0, 0); break; case CSI_DEC_hl_AUTOWRAP: vc->vc_decawm = on_off; break; case CSI_DEC_hl_AUTOREPEAT: if (on_off) set_kbd(vc, decarm); else clr_kbd(vc, decarm); break; case CSI_DEC_hl_MOUSE_X10: vc->vc_report_mouse = on_off ? 1 : 0; break; case CSI_DEC_hl_SHOW_CURSOR: vc->vc_deccm = on_off; break; case CSI_DEC_hl_MOUSE_VT200: vc->vc_report_mouse = on_off ? 2 : 0; break; case CSI_DEC_hl_BRACKETED_PASTE: vc->vc_bracketed_paste = on_off; break; case CSI_DEC_hl_ALT_SCREEN: if (on_off) enter_alt_screen(vc); else leave_alt_screen(vc); break; } } enum { CSI_hl_DISPLAY_CTRL = 3, /* handle ansi control chars */ CSI_hl_INSERT = 4, /* IRM: insert/replace */ CSI_hl_AUTO_NL = 20, /* LNM: Enter == CrLf/Lf */ }; /* console_lock is held */ static void csi_hl(struct vc_data *vc, bool on_off) { unsigned int i; for (i = 0; i <= vc->vc_npar; i++) switch (vc->vc_par[i]) { /* ANSI modes set/reset */ case CSI_hl_DISPLAY_CTRL: vc->vc_disp_ctrl = on_off; break; case CSI_hl_INSERT: vc->vc_decim = on_off; break; case CSI_hl_AUTO_NL: if (on_off) set_kbd(vc, lnm); else clr_kbd(vc, lnm); break; } } enum CSI_right_square_bracket { CSI_RSB_COLOR_FOR_UNDERLINE = 1, CSI_RSB_COLOR_FOR_HALF_BRIGHT = 2, CSI_RSB_MAKE_CUR_COLOR_DEFAULT = 8, CSI_RSB_BLANKING_INTERVAL = 9, CSI_RSB_BELL_FREQUENCY = 10, CSI_RSB_BELL_DURATION = 11, CSI_RSB_BRING_CONSOLE_TO_FRONT = 12, CSI_RSB_UNBLANK = 13, CSI_RSB_VESA_OFF_INTERVAL = 14, CSI_RSB_BRING_PREV_CONSOLE_TO_FRONT = 15, CSI_RSB_CURSOR_BLINK_INTERVAL = 16, }; /* * csi_RSB - csi+] (Right Square Bracket) handler * * These are linux console private sequences. * * console_lock is held */ static void csi_RSB(struct vc_data *vc) { switch (vc->vc_par[0]) { case CSI_RSB_COLOR_FOR_UNDERLINE: if (vc->vc_can_do_color && vc->vc_par[1] < 16) { vc->vc_ulcolor = color_table[vc->vc_par[1]]; if (vc->state.underline) update_attr(vc); } break; case CSI_RSB_COLOR_FOR_HALF_BRIGHT: if (vc->vc_can_do_color && vc->vc_par[1] < 16) { vc->vc_halfcolor = color_table[vc->vc_par[1]]; if (vc->state.intensity == VCI_HALF_BRIGHT) update_attr(vc); } break; case CSI_RSB_MAKE_CUR_COLOR_DEFAULT: vc->vc_def_color = vc->vc_attr; if (vc->vc_hi_font_mask == 0x100) vc->vc_def_color >>= 1; default_attr(vc); update_attr(vc); break; case CSI_RSB_BLANKING_INTERVAL: blankinterval = min(vc->vc_par[1], 60U) * 60; poke_blanked_console(); break; case CSI_RSB_BELL_FREQUENCY: if (vc->vc_npar >= 1) vc->vc_bell_pitch = vc->vc_par[1]; else vc->vc_bell_pitch = DEFAULT_BELL_PITCH; break; case CSI_RSB_BELL_DURATION: if (vc->vc_npar >= 1) vc->vc_bell_duration = (vc->vc_par[1] < 2000) ? 
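/* a requested duration of two seconds or more silences the bell */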
msecs_to_jiffies(vc->vc_par[1]) : 0; else vc->vc_bell_duration = DEFAULT_BELL_DURATION; break; case CSI_RSB_BRING_CONSOLE_TO_FRONT: if (vc->vc_par[1] >= 1 && vc_cons_allocated(vc->vc_par[1] - 1)) set_console(vc->vc_par[1] - 1); break; case CSI_RSB_UNBLANK: poke_blanked_console(); break; case CSI_RSB_VESA_OFF_INTERVAL: vesa_off_interval = min(vc->vc_par[1], 60U) * 60 * HZ; break; case CSI_RSB_BRING_PREV_CONSOLE_TO_FRONT: set_console(last_console); break; case CSI_RSB_CURSOR_BLINK_INTERVAL: if (vc->vc_npar >= 1 && vc->vc_par[1] >= 50 && vc->vc_par[1] <= USHRT_MAX) vc->vc_cur_blink_ms = vc->vc_par[1]; else vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; break; } } /* console_lock is held */ static void csi_at(struct vc_data *vc, unsigned int nr) { nr = clamp(nr, 1, vc->vc_cols - vc->state.x); insert_char(vc, nr); } /* console_lock is held */ static void csi_L(struct vc_data *vc) { unsigned int nr = clamp(vc->vc_par[0], 1, vc->vc_rows - vc->state.y); con_scroll(vc, vc->state.y, vc->vc_bottom, SM_DOWN, nr); vc->vc_need_wrap = 0; } /* console_lock is held */ static void csi_P(struct vc_data *vc) { unsigned int nr = clamp(vc->vc_par[0], 1, vc->vc_cols - vc->state.x); delete_char(vc, nr); } /* console_lock is held */ static void csi_M(struct vc_data *vc) { unsigned int nr = clamp(vc->vc_par[0], 1, vc->vc_rows - vc->state.y); con_scroll(vc, vc->state.y, vc->vc_bottom, SM_UP, nr); vc->vc_need_wrap = 0; } /* console_lock is held (except via vc_init->reset_terminal */ static void save_cur(struct vc_data *vc) { memcpy(&vc->saved_state, &vc->state, sizeof(vc->state)); } /* console_lock is held */ static void restore_cur(struct vc_data *vc) { memcpy(&vc->state, &vc->saved_state, sizeof(vc->state)); gotoxy(vc, vc->state.x, vc->state.y); vc->vc_translate = set_translate(vc->state.Gx_charset[vc->state.charset], vc); update_attr(vc); vc->vc_need_wrap = 0; } /** * enum vc_ctl_state - control characters state of a vt * * @ESnormal: initial state, no control characters parsed * @ESesc: ESC parsed * @ESsquare: CSI parsed -- modifiers/parameters/ctrl chars expected * @ESgetpars: CSI parsed -- parameters/ctrl chars expected * @ESfunckey: CSI [ parsed * @EShash: ESC # parsed * @ESsetG0: ESC ( parsed * @ESsetG1: ESC ) parsed * @ESpercent: ESC % parsed * @EScsiignore: CSI [0x20-0x3f] parsed * @ESnonstd: OSC parsed * @ESpalette: OSC P parsed * @ESosc: OSC [0-9] parsed * @ESANSI_first: first state for ignoring ansi control sequences * @ESapc: ESC _ parsed * @ESpm: ESC ^ parsed * @ESdcs: ESC P parsed * @ESANSI_last: last state for ignoring ansi control sequences */ enum vc_ctl_state { ESnormal, ESesc, ESsquare, ESgetpars, ESfunckey, EShash, ESsetG0, ESsetG1, ESpercent, EScsiignore, ESnonstd, ESpalette, ESosc, ESANSI_first = ESosc, ESapc, ESpm, ESdcs, ESANSI_last = ESdcs, }; /* console_lock is held (except via vc_init()) */ static void reset_terminal(struct vc_data *vc, int do_clear) { unsigned int i; vc->vc_top = 0; vc->vc_bottom = vc->vc_rows; vc->vc_state = ESnormal; vc->vc_priv = EPecma; vc->vc_translate = set_translate(LAT1_MAP, vc); vc->state.Gx_charset[0] = LAT1_MAP; vc->state.Gx_charset[1] = GRAF_MAP; vc->state.charset = 0; vc->vc_need_wrap = 0; vc->vc_report_mouse = 0; vc->vc_bracketed_paste = 0; vc->vc_utf = default_utf8; vc->vc_utf_count = 0; vc->vc_disp_ctrl = 0; vc->vc_toggle_meta = 0; vc->vc_decscnm = 0; vc->vc_decom = 0; vc->vc_decawm = 1; vc->vc_deccm = global_cursor_default; vc->vc_decim = 0; if (vc->vc_saved_screen != NULL) { kfree(vc->vc_saved_screen); vc->vc_saved_screen = NULL; vc->vc_saved_rows = 0; 
vc->vc_saved_cols = 0; } vt_reset_keyboard(vc->vc_num); vc->vc_cursor_type = cur_default; vc->vc_complement_mask = vc->vc_s_complement_mask; default_attr(vc); update_attr(vc); bitmap_zero(vc->vc_tab_stop, VC_TABSTOPS_COUNT); for (i = 0; i < VC_TABSTOPS_COUNT; i += 8) set_bit(i, vc->vc_tab_stop); vc->vc_bell_pitch = DEFAULT_BELL_PITCH; vc->vc_bell_duration = DEFAULT_BELL_DURATION; vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; gotoxy(vc, 0, 0); save_cur(vc); if (do_clear) csi_J(vc, CSI_J_VISIBLE); } static void vc_setGx(struct vc_data *vc, unsigned int which, u8 c) { unsigned char *charset = &vc->state.Gx_charset[which]; switch (c) { case '0': *charset = GRAF_MAP; break; case 'B': *charset = LAT1_MAP; break; case 'U': *charset = IBMPC_MAP; break; case 'K': *charset = USER_MAP; break; } if (vc->state.charset == which) vc->vc_translate = set_translate(*charset, vc); } static bool ansi_control_string(enum vc_ctl_state state) { return state >= ESANSI_first && state <= ESANSI_last; } enum { ASCII_NULL = 0, ASCII_BELL = 7, ASCII_BACKSPACE = 8, ASCII_IGNORE_FIRST = ASCII_BACKSPACE, ASCII_HTAB = 9, ASCII_LINEFEED = 10, ASCII_VTAB = 11, ASCII_FORMFEED = 12, ASCII_CAR_RET = 13, ASCII_IGNORE_LAST = ASCII_CAR_RET, ASCII_SHIFTOUT = 14, ASCII_SHIFTIN = 15, ASCII_CANCEL = 24, ASCII_SUBSTITUTE = 26, ASCII_ESCAPE = 27, ASCII_CSI_IGNORE_FIRST = ' ', /* 0x2x, 0x3a and 0x3c - 0x3f */ ASCII_CSI_IGNORE_LAST = '?', ASCII_DEL = 127, ASCII_EXT_CSI = 128 + ASCII_ESCAPE, }; /* * Handle ascii characters in control sequences and change states accordingly. * E.g. ESC sets the state of vc to ESesc. * * Returns: true if @c handled. */ static bool handle_ascii(struct tty_struct *tty, struct vc_data *vc, u8 c) { switch (c) { case ASCII_NULL: return true; case ASCII_BELL: if (ansi_control_string(vc->vc_state)) vc->vc_state = ESnormal; else if (vc->vc_bell_duration) kd_mksound(vc->vc_bell_pitch, vc->vc_bell_duration); return true; case ASCII_BACKSPACE: bs(vc); return true; case ASCII_HTAB: vc->vc_pos -= (vc->state.x << 1); vc->state.x = find_next_bit(vc->vc_tab_stop, min(vc->vc_cols - 1, VC_TABSTOPS_COUNT), vc->state.x + 1); if (vc->state.x >= VC_TABSTOPS_COUNT) vc->state.x = vc->vc_cols - 1; vc->vc_pos += (vc->state.x << 1); notify_write(vc, '\t'); return true; case ASCII_LINEFEED: case ASCII_VTAB: case ASCII_FORMFEED: lf(vc); if (!is_kbd(vc, lnm)) return true; fallthrough; case ASCII_CAR_RET: cr(vc); return true; case ASCII_SHIFTOUT: vc->state.charset = 1; vc->vc_translate = set_translate(vc->state.Gx_charset[1], vc); vc->vc_disp_ctrl = 1; return true; case ASCII_SHIFTIN: vc->state.charset = 0; vc->vc_translate = set_translate(vc->state.Gx_charset[0], vc); vc->vc_disp_ctrl = 0; return true; case ASCII_CANCEL: case ASCII_SUBSTITUTE: vc->vc_state = ESnormal; return true; case ASCII_ESCAPE: vc->vc_state = ESesc; return true; case ASCII_DEL: del(vc); return true; case ASCII_EXT_CSI: vc->vc_state = ESsquare; return true; } return false; } /* * Handle a character (@c) following an ESC (when @vc is in the ESesc state). * E.g. previous ESC with @c == '[' here yields the ESsquare state (that is: * CSI). 
*/ static void handle_esc(struct tty_struct *tty, struct vc_data *vc, u8 c) { vc->vc_state = ESnormal; switch (c) { case '[': vc->vc_state = ESsquare; break; case ']': vc->vc_state = ESnonstd; break; case '_': vc->vc_state = ESapc; break; case '^': vc->vc_state = ESpm; break; case '%': vc->vc_state = ESpercent; break; case 'E': cr(vc); lf(vc); break; case 'M': ri(vc); break; case 'D': lf(vc); break; case 'H': if (vc->state.x < VC_TABSTOPS_COUNT) set_bit(vc->state.x, vc->vc_tab_stop); break; case 'P': vc->vc_state = ESdcs; break; case 'Z': respond_ID(tty); break; case '7': save_cur(vc); break; case '8': restore_cur(vc); break; case '(': vc->vc_state = ESsetG0; break; case ')': vc->vc_state = ESsetG1; break; case '#': vc->vc_state = EShash; break; case 'c': reset_terminal(vc, 1); break; case '>': /* Numeric keypad */ clr_kbd(vc, kbdapplic); break; case '=': /* Appl. keypad */ set_kbd(vc, kbdapplic); break; } } /* * Handle special DEC control sequences ("ESC [ ? parameters char"). Parameters * are in @vc->vc_par and the char is in @c here. */ static void csi_DEC(struct tty_struct *tty, struct vc_data *vc, u8 c) { switch (c) { case 'h': csi_DEC_hl(vc, true); break; case 'l': csi_DEC_hl(vc, false); break; case 'c': if (vc->vc_par[0]) vc->vc_cursor_type = CUR_MAKE(vc->vc_par[0], vc->vc_par[1], vc->vc_par[2]); else vc->vc_cursor_type = cur_default; break; case 'm': clear_selection(); if (vc->vc_par[0]) vc->vc_complement_mask = vc->vc_par[0] << 8 | vc->vc_par[1]; else vc->vc_complement_mask = vc->vc_s_complement_mask; break; case 'n': if (vc->vc_par[0] == 5) status_report(tty); else if (vc->vc_par[0] == 6) cursor_report(vc, tty); break; } } /* * Handle Control Sequence Introducer control characters. That is * "ESC [ parameters char". Parameters are in @vc->vc_par and the char is in * @c here. 
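* These are the private sequences introduced by the '?' intermediate
* (vc_priv == EPdec), e.g. "CSI ? 25 h/l" showing or hiding the cursor
* via csi_DEC_hl().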
*/ static void csi_ECMA(struct tty_struct *tty, struct vc_data *vc, u8 c) { switch (c) { case 'G': case '`': if (vc->vc_par[0]) vc->vc_par[0]--; gotoxy(vc, vc->vc_par[0], vc->state.y); break; case 'A': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x, vc->state.y - vc->vc_par[0]); break; case 'B': case 'e': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x, vc->state.y + vc->vc_par[0]); break; case 'C': case 'a': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x + vc->vc_par[0], vc->state.y); break; case 'D': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x - vc->vc_par[0], vc->state.y); break; case 'E': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, 0, vc->state.y + vc->vc_par[0]); break; case 'F': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, 0, vc->state.y - vc->vc_par[0]); break; case 'd': if (vc->vc_par[0]) vc->vc_par[0]--; gotoxay(vc, vc->state.x ,vc->vc_par[0]); break; case 'H': case 'f': if (vc->vc_par[0]) vc->vc_par[0]--; if (vc->vc_par[1]) vc->vc_par[1]--; gotoxay(vc, vc->vc_par[1], vc->vc_par[0]); break; case 'J': csi_J(vc, vc->vc_par[0]); break; case 'K': csi_K(vc); break; case 'L': csi_L(vc); break; case 'M': csi_M(vc); break; case 'P': csi_P(vc); break; case 'c': if (!vc->vc_par[0]) respond_ID(tty); break; case 'g': if (!vc->vc_par[0] && vc->state.x < VC_TABSTOPS_COUNT) set_bit(vc->state.x, vc->vc_tab_stop); else if (vc->vc_par[0] == 3) bitmap_zero(vc->vc_tab_stop, VC_TABSTOPS_COUNT); break; case 'h': csi_hl(vc, true); break; case 'l': csi_hl(vc, false); break; case 'm': csi_m(vc); break; case 'n': if (vc->vc_par[0] == 5) status_report(tty); else if (vc->vc_par[0] == 6) cursor_report(vc, tty); break; case 'q': /* DECLL - but only 3 leds */ /* map 0,1,2,3 to 0,1,2,4 */ if (vc->vc_par[0] < 4) vt_set_led_state(vc->vc_num, (vc->vc_par[0] < 3) ? vc->vc_par[0] : 4); break; case 'r': if (!vc->vc_par[0]) vc->vc_par[0]++; if (!vc->vc_par[1]) vc->vc_par[1] = vc->vc_rows; /* Minimum allowed region is 2 lines */ if (vc->vc_par[0] < vc->vc_par[1] && vc->vc_par[1] <= vc->vc_rows) { vc->vc_top = vc->vc_par[0] - 1; vc->vc_bottom = vc->vc_par[1]; gotoxay(vc, 0, 0); } break; case 's': save_cur(vc); break; case 'u': restore_cur(vc); break; case 'X': csi_X(vc); break; case '@': csi_at(vc, vc->vc_par[0]); break; case ']': csi_RSB(vc); break; } } static void vc_reset_params(struct vc_data *vc) { memset(vc->vc_par, 0, sizeof(vc->vc_par)); vc->vc_npar = 0; } /* console_lock is held */ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, u8 c) { /* * Control characters can be used in the _middle_ * of an escape sequence, aside from ANSI control strings. */ if (ansi_control_string(vc->vc_state) && c >= ASCII_IGNORE_FIRST && c <= ASCII_IGNORE_LAST) return; if (handle_ascii(tty, vc, c)) return; switch(vc->vc_state) { case ESesc: /* ESC */ handle_esc(tty, vc, c); return; case ESnonstd: /* ESC ] aka OSC */ switch (c) { case 'P': /* palette escape sequence */ vc_reset_params(vc); vc->vc_state = ESpalette; return; case 'R': /* reset palette */ reset_palette(vc); break; case '0' ... 
'9': vc->vc_state = ESosc; return; } vc->vc_state = ESnormal; return; case ESpalette: /* ESC ] P aka OSC P */ if (isxdigit(c)) { vc->vc_par[vc->vc_npar++] = hex_to_bin(c); if (vc->vc_npar == 7) { int i = vc->vc_par[0] * 3, j = 1; vc->vc_palette[i] = 16 * vc->vc_par[j++]; vc->vc_palette[i++] += vc->vc_par[j++]; vc->vc_palette[i] = 16 * vc->vc_par[j++]; vc->vc_palette[i++] += vc->vc_par[j++]; vc->vc_palette[i] = 16 * vc->vc_par[j++]; vc->vc_palette[i] += vc->vc_par[j]; set_palette(vc); vc->vc_state = ESnormal; } } else vc->vc_state = ESnormal; return; case ESsquare: /* ESC [ aka CSI, parameters or modifiers expected */ vc_reset_params(vc); vc->vc_state = ESgetpars; switch (c) { case '[': /* Function key */ vc->vc_state = ESfunckey; return; case '?': vc->vc_priv = EPdec; return; case '>': vc->vc_priv = EPgt; return; case '=': vc->vc_priv = EPeq; return; case '<': vc->vc_priv = EPlt; return; } vc->vc_priv = EPecma; fallthrough; case ESgetpars: /* ESC [ aka CSI, parameters expected */ switch (c) { case ';': if (vc->vc_npar < NPAR - 1) { vc->vc_npar++; return; } break; case '0' ... '9': vc->vc_par[vc->vc_npar] *= 10; vc->vc_par[vc->vc_npar] += c - '0'; return; } if (c >= ASCII_CSI_IGNORE_FIRST && c <= ASCII_CSI_IGNORE_LAST) { vc->vc_state = EScsiignore; return; } /* parameters done, handle the control char @c */ vc->vc_state = ESnormal; switch (vc->vc_priv) { case EPdec: csi_DEC(tty, vc, c); return; case EPecma: csi_ECMA(tty, vc, c); return; default: return; } case EScsiignore: if (c >= ASCII_CSI_IGNORE_FIRST && c <= ASCII_CSI_IGNORE_LAST) return; vc->vc_state = ESnormal; return; case ESpercent: /* ESC % */ vc->vc_state = ESnormal; switch (c) { case '@': /* defined in ISO 2022 */ vc->vc_utf = 0; return; case 'G': /* prelim official escape code */ case '8': /* retained for compatibility */ vc->vc_utf = 1; return; } return; case ESfunckey: /* ESC [ [ aka CSI [ */ vc->vc_state = ESnormal; return; case EShash: /* ESC # */ vc->vc_state = ESnormal; if (c == '8') { /* DEC screen alignment test. 
kludge :-) */ vc->vc_video_erase_char = (vc->vc_video_erase_char & 0xff00) | 'E'; csi_J(vc, CSI_J_VISIBLE); vc->vc_video_erase_char = (vc->vc_video_erase_char & 0xff00) | ' '; do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); } return; case ESsetG0: /* ESC ( */ vc_setGx(vc, 0, c); vc->vc_state = ESnormal; return; case ESsetG1: /* ESC ) */ vc_setGx(vc, 1, c); vc->vc_state = ESnormal; return; case ESapc: /* ESC _ */ return; case ESosc: /* ESC ] [0-9] aka OSC [0-9] */ return; case ESpm: /* ESC ^ */ return; case ESdcs: /* ESC P */ return; default: vc->vc_state = ESnormal; } } struct vc_draw_region { unsigned long from, to; int x; }; static void con_flush(struct vc_data *vc, struct vc_draw_region *draw) { if (draw->x < 0) return; vc->vc_sw->con_putcs(vc, (u16 *)draw->from, (u16 *)draw->to - (u16 *)draw->from, vc->state.y, draw->x); draw->x = -1; } static inline int vc_translate_ascii(const struct vc_data *vc, int c) { if (IS_ENABLED(CONFIG_CONSOLE_TRANSLATIONS)) { if (vc->vc_toggle_meta) c |= 0x80; return vc->vc_translate[c]; } return c; } /** * vc_sanitize_unicode - Replace invalid Unicode code points with ``U+FFFD`` * @c: the received code point */ static inline int vc_sanitize_unicode(const int c) { if (c >= 0xd800 && c <= 0xdfff) return 0xfffd; return c; } /** * vc_translate_unicode - Combine UTF-8 into Unicode in &vc_data.vc_utf_char * @vc: virtual console * @c: UTF-8 byte to translate * @rescan: set to true iff @c wasn't consumed here and needs to be re-processed * * * &vc_data.vc_utf_char is the being-constructed Unicode code point. * * &vc_data.vc_utf_count is the number of continuation bytes still expected to * arrive. * * &vc_data.vc_npar is the number of continuation bytes arrived so far. * * Return: * * %-1 - Input OK so far, @c consumed, further bytes expected. * * %0xFFFD - Possibility 1: input invalid, @c may have been consumed (see * desc. of @rescan). Possibility 2: input OK, @c consumed, * ``U+FFFD`` is the resulting code point. ``U+FFFD`` is valid, * ``REPLACEMENT CHARACTER``. * * otherwise - Input OK, @c consumed, resulting code point returned. */ static int vc_translate_unicode(struct vc_data *vc, int c, bool *rescan) { static const u32 utf8_length_changes[] = {0x7f, 0x7ff, 0xffff, 0x10ffff}; /* Continuation byte received */ if ((c & 0xc0) == 0x80) { /* Unexpected continuation byte? 
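That is, a 0b10xxxxxx byte arrived while no multibyte sequence was in progress.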
*/ if (!vc->vc_utf_count) goto bad_sequence; vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f); vc->vc_npar++; if (--vc->vc_utf_count) goto need_more_bytes; /* Got a whole character */ c = vc->vc_utf_char; /* Reject overlong sequences */ if (c <= utf8_length_changes[vc->vc_npar - 1] || c > utf8_length_changes[vc->vc_npar]) goto bad_sequence; return vc_sanitize_unicode(c); } /* Single ASCII byte or first byte of a sequence received */ if (vc->vc_utf_count) { /* A continuation byte was expected */ *rescan = true; vc->vc_utf_count = 0; goto bad_sequence; } /* Nothing to do if an ASCII byte was received */ if (c <= 0x7f) return c; /* First byte of a multibyte sequence received */ vc->vc_npar = 0; if ((c & 0xe0) == 0xc0) { vc->vc_utf_count = 1; vc->vc_utf_char = (c & 0x1f); } else if ((c & 0xf0) == 0xe0) { vc->vc_utf_count = 2; vc->vc_utf_char = (c & 0x0f); } else if ((c & 0xf8) == 0xf0) { vc->vc_utf_count = 3; vc->vc_utf_char = (c & 0x07); } else { goto bad_sequence; } need_more_bytes: return -1; bad_sequence: return 0xfffd; } static int vc_translate(struct vc_data *vc, int *c, bool *rescan) { /* Do no translation at all in control states */ if (vc->vc_state != ESnormal) return *c; if (vc->vc_utf && !vc->vc_disp_ctrl) return *c = vc_translate_unicode(vc, *c, rescan); /* no utf or alternate charset mode */ return vc_translate_ascii(vc, *c); } static inline unsigned char vc_invert_attr(const struct vc_data *vc) { if (!vc->vc_can_do_color) return vc->vc_attr ^ 0x08; if (vc->vc_hi_font_mask == 0x100) return (vc->vc_attr & 0x11) | ((vc->vc_attr & 0xe0) >> 4) | ((vc->vc_attr & 0x0e) << 4); return (vc->vc_attr & 0x88) | ((vc->vc_attr & 0x70) >> 4) | ((vc->vc_attr & 0x07) << 4); } static bool vc_is_control(struct vc_data *vc, int tc, int c) { /* * A bitmap for codes <32. A bit of 1 indicates that the code * corresponding to that bit number invokes some special action (such * as cursor movement) and should not be displayed as a glyph unless * the disp_ctrl mode is explicitly enabled. */ static const u32 CTRL_ACTION = BIT(ASCII_NULL) | GENMASK(ASCII_SHIFTIN, ASCII_BELL) | BIT(ASCII_CANCEL) | BIT(ASCII_SUBSTITUTE) | BIT(ASCII_ESCAPE); /* Cannot be overridden by disp_ctrl */ static const u32 CTRL_ALWAYS = BIT(ASCII_NULL) | BIT(ASCII_BACKSPACE) | BIT(ASCII_LINEFEED) | BIT(ASCII_SHIFTIN) | BIT(ASCII_SHIFTOUT) | BIT(ASCII_CAR_RET) | BIT(ASCII_FORMFEED) | BIT(ASCII_ESCAPE); if (vc->vc_state != ESnormal) return true; if (!tc) return true; /* * If the original code was a control character we only allow a glyph * to be displayed if the code is not normally used (such as for cursor * movement) or if the disp_ctrl mode has been explicitly enabled. * Certain characters (as given by the CTRL_ALWAYS bitmap) are always * displayed as control characters, as the console would be pretty * useless without them; to display an arbitrary font position use the * direct-to-font zone in UTF-8 mode. 
*/ if (c < BITS_PER_TYPE(CTRL_ALWAYS)) { if (vc->vc_disp_ctrl) return CTRL_ALWAYS & BIT(c); else return vc->vc_utf || (CTRL_ACTION & BIT(c)); } if (c == ASCII_DEL && !vc->vc_disp_ctrl) return true; if (c == ASCII_EXT_CSI) return true; return false; } static void vc_con_rewind(struct vc_data *vc) { if (vc->state.x && !vc->vc_need_wrap) { vc->vc_pos -= 2; vc->state.x--; } vc->vc_need_wrap = 0; } #define UCS_ZWS 0x200b /* Zero Width Space */ #define UCS_VS16 0xfe0f /* Variation Selector 16 */ #define UCS_REPLACEMENT 0xfffd /* Replacement Character */ static int vc_process_ucs(struct vc_data *vc, int *c, int *tc) { u32 prev_c, curr_c = *c; if (ucs_is_double_width(curr_c)) { /* * The Unicode screen memory is allocated only when * required. This is one such case as we need to remember * which displayed characters are double-width. */ vc_uniscr_check(vc); return 2; } if (!ucs_is_zero_width(curr_c)) return 1; /* From here curr_c is known to be zero-width. */ if (ucs_is_double_width(vc_uniscr_getc(vc, -2))) { /* * Let's merge this zero-width code point with the preceding * double-width code point by replacing the existing * zero-width space padding. To do so we rewind one column * and pretend this has a width of 1. * We give the legacy display the same initial space padding. */ vc_con_rewind(vc); *tc = ' '; return 1; } /* From here the preceding character, if any, must be single-width. */ prev_c = vc_uniscr_getc(vc, -1); if (curr_c == UCS_VS16 && prev_c != 0) { /* * VS16 (U+FE0F) is special. It typically turns the preceding * single-width character into a double-width one. Let it * have a width of 1 effectively making the combination with * the preceding character double-width. */ *tc = ' '; return 1; } /* try recomposition */ prev_c = ucs_recompose(prev_c, curr_c); if (prev_c != 0) { vc_con_rewind(vc); *tc = *c = prev_c; return 1; } /* Otherwise zero-width code points are ignored. */ return 0; } static int vc_get_glyph(struct vc_data *vc, int tc) { int glyph = conv_uni_to_pc(vc, tc); u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; if (!(glyph & ~charmask)) return glyph; if (glyph == -1) return -1; /* nothing to display */ /* Glyph not found */ if ((!vc->vc_utf || vc->vc_disp_ctrl || tc < 128) && !(tc & ~charmask)) { /* * In legacy mode use the glyph we get by a 1:1 mapping. * This would make absolutely no sense with Unicode in mind, but do this for * ASCII characters since a font may lack Unicode mapping info and we don't * want to end up with having question marks only. */ return tc; } /* * The Unicode screen memory is allocated only when required. * This is one such case: we're about to "cheat" with the displayed * character meaning the simple screen buffer won't hold the original * information, whereas the Unicode screen buffer always does. */ vc_uniscr_check(vc); /* Try getting a simpler fallback character. */ tc = ucs_get_fallback(tc); if (tc) return vc_get_glyph(vc, tc); /* Display U+FFFD (Unicode Replacement Character). 
*/ return conv_uni_to_pc(vc, UCS_REPLACEMENT); } static int vc_con_write_normal(struct vc_data *vc, int tc, int c, struct vc_draw_region *draw) { int next_c; unsigned char vc_attr = vc->vc_attr; u16 himask = vc->vc_hi_font_mask; u8 width = 1; bool inverse = false; if (vc->vc_utf && !vc->vc_disp_ctrl) { width = vc_process_ucs(vc, &c, &tc); if (!width) goto out; } /* Now try to find out how to display it */ tc = vc_get_glyph(vc, tc); if (tc == -1) return -1; /* nothing to display */ if (tc < 0) { inverse = true; tc = conv_uni_to_pc(vc, '?'); if (tc < 0) tc = '?'; vc_attr = vc_invert_attr(vc); con_flush(vc, draw); } next_c = c; while (1) { if (vc->vc_need_wrap || vc->vc_decim) con_flush(vc, draw); if (vc->vc_need_wrap) { cr(vc); lf(vc); } if (vc->vc_decim) insert_char(vc, 1); vc_uniscr_putc(vc, next_c); if (himask) tc = ((tc & 0x100) ? himask : 0) | (tc & 0xff); tc |= (vc_attr << 8) & ~himask; scr_writew(tc, (u16 *)vc->vc_pos); if (con_should_update(vc) && draw->x < 0) { draw->x = vc->state.x; draw->from = vc->vc_pos; } if (vc->state.x == vc->vc_cols - 1) { vc->vc_need_wrap = vc->vc_decawm; draw->to = vc->vc_pos + 2; } else { vc->state.x++; draw->to = (vc->vc_pos += 2); } if (!--width) break; /* A space is printed in the second column */ tc = conv_uni_to_pc(vc, ' '); if (tc < 0) tc = ' '; /* * Store a zero-width space in the Unicode screen given that * the previous code point is semantically double width. */ next_c = UCS_ZWS; } out: notify_write(vc, c); if (inverse) con_flush(vc, draw); return 0; } /* acquires console_lock */ static int do_con_write(struct tty_struct *tty, const u8 *buf, int count) { struct vc_draw_region draw = { .x = -1, }; int c, tc, n = 0; unsigned int currcons; struct vc_data *vc = tty->driver_data; struct vt_notifier_param param; bool rescan; if (in_interrupt()) return count; guard(console_lock)(); currcons = vc->vc_num; if (!vc_cons_allocated(currcons)) { /* could this happen? */ pr_warn_once("con_write: tty %d not allocated\n", currcons+1); return 0; } /* undraw cursor first */ if (con_is_fg(vc)) hide_cursor(vc); param.vc = vc; while (!tty->flow.stopped && count) { u8 orig = *buf; buf++; n++; count--; rescan_last_byte: c = orig; rescan = false; tc = vc_translate(vc, &c, &rescan); if (tc == -1) continue; param.c = tc; if (atomic_notifier_call_chain(&vt_notifier_list, VT_PREWRITE, ¶m) == NOTIFY_STOP) continue; if (vc_is_control(vc, tc, c)) { con_flush(vc, &draw); do_con_trol(tty, vc, orig); continue; } if (vc_con_write_normal(vc, tc, c, &draw) < 0) continue; if (rescan) goto rescan_last_byte; } con_flush(vc, &draw); console_conditional_schedule(); notify_update(vc); return n; } /* * This is the console switching callback. * * Doing console switching in a process context allows * us to do the switches asynchronously (needed when we want * to switch due to a keyboard interrupt). Synchronization * with other console code and prevention of re-entrancy is * ensured with console_lock. 
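 * The flow is: set_console() below records the target console in want_console
 * and calls schedule_console_callback(); the work item then runs
 * console_callback(), which performs the actual change_console() with the
 * console lock held.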
*/ static void console_callback(struct work_struct *ignored) { guard(console_lock)(); if (want_console >= 0) { if (want_console != fg_console && vc_cons_allocated(want_console)) { hide_cursor(vc_cons[fg_console].d); change_console(vc_cons[want_console].d); /* we only changed when the console had already been allocated - a new console is not created in an interrupt routine */ } want_console = -1; } if (do_poke_blanked_console) { /* do not unblank for a LED change */ do_poke_blanked_console = 0; poke_blanked_console(); } if (scrollback_delta) { struct vc_data *vc = vc_cons[fg_console].d; clear_selection(); if (vc->vc_mode == KD_TEXT && vc->vc_sw->con_scrolldelta) vc->vc_sw->con_scrolldelta(vc, scrollback_delta); scrollback_delta = 0; } if (blank_timer_expired) { do_blank_screen(0); blank_timer_expired = 0; } notify_update(vc_cons[fg_console].d); } int set_console(int nr) { struct vc_data *vc = vc_cons[fg_console].d; if (!vc_cons_allocated(nr) || vt_dont_switch || (vc->vt_mode.mode == VT_AUTO && vc->vc_mode == KD_GRAPHICS)) { /* * Console switch will fail in console_callback() or * change_console() so there is no point scheduling * the callback * * Existing set_console() users don't check the return * value so this shouldn't break anything */ return -EINVAL; } want_console = nr; schedule_console_callback(); return 0; } struct tty_driver *console_driver; #ifdef CONFIG_VT_CONSOLE /** * vt_kmsg_redirect() - sets/gets the kernel message console * @new: the new virtual terminal number or -1 if the console should stay * unchanged * * By default, the kernel messages are always printed on the current virtual * console. However, the user may modify that default with the * %TIOCL_SETKMSGREDIRECT ioctl call. * * This function sets the kernel message console to be @new. It returns the old * virtual console number. The virtual terminal number %0 (both as parameter and * return value) means no redirection (i.e. always printed on the currently * active console). * * The parameter -1 means that only the current console is returned, but the * value is not modified. You may use the macro vt_get_kmsg_redirect() in that * case to make the code more understandable. * * When the kernel is compiled without %CONFIG_VT_CONSOLE, this function ignores * the parameter and always returns %0. */ int vt_kmsg_redirect(int new) { static int kmsg_con; if (new != -1) return xchg(&kmsg_con, new); else return kmsg_con; } /* * Console on virtual terminal * * The console must be locked when we get here. 
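 * vt_console_print() below is the ->write() hook of vt_console_driver, which
 * con_init() registers with register_console() when CONFIG_VT_CONSOLE is set.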
*/ static void vt_console_print(struct console *co, const char *b, unsigned count) { struct vc_data *vc = vc_cons[fg_console].d; unsigned char c; static DEFINE_SPINLOCK(printing_lock); const ushort *start; ushort start_x, cnt; int kmsg_console; WARN_CONSOLE_UNLOCKED(); /* this protects against concurrent oops only */ if (!spin_trylock(&printing_lock)) return; kmsg_console = vt_get_kmsg_redirect(); if (kmsg_console && vc_cons_allocated(kmsg_console - 1)) vc = vc_cons[kmsg_console - 1].d; if (!vc_cons_allocated(fg_console)) { /* impossible */ /* printk("vt_console_print: tty %d not allocated ??\n", currcons+1); */ goto quit; } if (vc->vc_mode != KD_TEXT) goto quit; /* undraw cursor first */ if (con_is_fg(vc)) hide_cursor(vc); start = (ushort *)vc->vc_pos; start_x = vc->state.x; cnt = 0; while (count--) { c = *b++; if (c == ASCII_LINEFEED || c == ASCII_CAR_RET || c == ASCII_BACKSPACE || vc->vc_need_wrap) { if (cnt && con_is_visible(vc)) vc->vc_sw->con_putcs(vc, start, cnt, vc->state.y, start_x); cnt = 0; if (c == ASCII_BACKSPACE) { bs(vc); start = (ushort *)vc->vc_pos; start_x = vc->state.x; continue; } if (c != ASCII_CAR_RET) lf(vc); cr(vc); start = (ushort *)vc->vc_pos; start_x = vc->state.x; if (c == ASCII_LINEFEED || c == ASCII_CAR_RET) continue; } vc_uniscr_putc(vc, c); scr_writew((vc->vc_attr << 8) + c, (unsigned short *)vc->vc_pos); notify_write(vc, c); cnt++; if (vc->state.x == vc->vc_cols - 1) { vc->vc_need_wrap = 1; } else { vc->vc_pos += 2; vc->state.x++; } } if (cnt && con_is_visible(vc)) vc->vc_sw->con_putcs(vc, start, cnt, vc->state.y, start_x); set_cursor(vc); notify_update(vc); quit: spin_unlock(&printing_lock); } static struct tty_driver *vt_console_device(struct console *c, int *index) { *index = c->index ? c->index-1 : fg_console; return console_driver; } static int vt_console_setup(struct console *co, char *options) { return co->index >= MAX_NR_CONSOLES ? -EINVAL : 0; } static struct console vt_console_driver = { .name = "tty", .setup = vt_console_setup, .write = vt_console_print, .device = vt_console_device, .unblank = unblank_screen, .flags = CON_PRINTBUFFER, .index = -1, }; #endif /* * Handling of Linux-specific VC ioctls */ /* * Generally a bit racy with respect to console_lock();. * * There are some functions which don't need it. * * There are some functions which can sleep for arbitrary periods * (paste_selection) but we don't need the lock there anyway. * * set_selection_user has locking, and definitely needs it */ int tioclinux(struct tty_struct *tty, unsigned long arg) { char type, data; char __user *p = (char __user *)arg; void __user *param_aligned32 = (u32 __user *)arg + 1; void __user *param = (void __user *)arg + 1; int lines; int ret; if (current->signal->tty != tty && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(type, p)) return -EFAULT; ret = 0; switch (type) { case TIOCL_SETSEL: return set_selection_user(param, tty); case TIOCL_PASTESEL: if (!capable(CAP_SYS_ADMIN)) return -EPERM; return paste_selection(tty); case TIOCL_UNBLANKSCREEN: scoped_guard(console_lock) unblank_screen(); break; case TIOCL_SELLOADLUT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; return sel_loadlut(param_aligned32); case TIOCL_GETSHIFTSTATE: /* * Make it possible to react to Shift+Mousebutton. Note that * 'shift_state' is an undocumented kernel-internal variable; * programs not closely related to the kernel should not use * this. 
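 * All TIOCLINUX subcodes share one calling convention: byte 0 of the user
 * buffer selects the subcode and any payload follows it.  A minimal user-space
 * sketch (assuming <linux/tiocl.h> and a console fd; shown for illustration
 * only, given the warning above):
 *
 *	char arg = TIOCL_GETSHIFTSTATE;
 *
 *	if (ioctl(fd, TIOCLINUX, &arg) == 0)
 *		printf("shift state: %#x\n", arg);
 *
 * The result is written back into byte 0 of the same buffer.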
*/ data = vt_get_shift_state(); return put_user(data, p); case TIOCL_GETMOUSEREPORTING: scoped_guard(console_lock) /* May be overkill */ data = mouse_reporting(); return put_user(data, p); case TIOCL_SETVESABLANK: return set_vesa_blanking(param); case TIOCL_GETKMSGREDIRECT: data = vt_get_kmsg_redirect(); return put_user(data, p); case TIOCL_SETKMSGREDIRECT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(data, p+1)) return -EFAULT; vt_kmsg_redirect(data); break; case TIOCL_GETFGCONSOLE: /* * No locking needed as this is a transiently correct return * anyway if the caller hasn't disabled switching. */ return fg_console; case TIOCL_SCROLLCONSOLE: if (get_user(lines, (s32 __user *)param_aligned32)) return -EFAULT; /* * Needs the console lock here. Note that lots of other calls * need fixing before the lock is actually useful! */ scoped_guard(console_lock) scrollfront(vc_cons[fg_console].d, lines); break; case TIOCL_BLANKSCREEN: /* until explicitly unblanked, not only poked */ scoped_guard(console_lock) { ignore_poke = 1; do_blank_screen(0); } break; case TIOCL_BLANKEDSCREEN: return console_blanked; case TIOCL_GETBRACKETEDPASTE: return get_bracketed_paste(tty); default: return -EINVAL; } return ret; } /* * /dev/ttyN handling */ static ssize_t con_write(struct tty_struct *tty, const u8 *buf, size_t count) { int retval; retval = do_con_write(tty, buf, count); con_flush_chars(tty); return retval; } static int con_put_char(struct tty_struct *tty, u8 ch) { return do_con_write(tty, &ch, 1); } static unsigned int con_write_room(struct tty_struct *tty) { if (tty->flow.stopped) return 0; return 32768; /* No limit, really; we're not buffering */ } /* * con_throttle and con_unthrottle are only used for * paste_selection(), which has to stuff in a large number of * characters... */ static void con_throttle(struct tty_struct *tty) { } static void con_unthrottle(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; wake_up_interruptible(&vc->paste_wait); } /* * Turn the Scroll-Lock LED on when the tty is stopped */ static void con_stop(struct tty_struct *tty) { int console_num; if (!tty) return; console_num = tty->index; if (!vc_cons_allocated(console_num)) return; vt_kbd_con_stop(console_num); } /* * Turn the Scroll-Lock LED off when the console is started */ static void con_start(struct tty_struct *tty) { int console_num; if (!tty) return; console_num = tty->index; if (!vc_cons_allocated(console_num)) return; vt_kbd_con_start(console_num); } static void con_flush_chars(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; if (in_interrupt()) /* from flush_to_ldisc */ return; guard(console_lock)(); set_cursor(vc); } /* * Allocate the console screen memory. 
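 * con_install() below is the tty_operations ->install() hook (see con_ops
 * further down); it runs on the first open of /dev/ttyN and allocates the
 * virtual console through vc_allocate().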
*/ static int con_install(struct tty_driver *driver, struct tty_struct *tty) { unsigned int currcons = tty->index; struct vc_data *vc; int ret; guard(console_lock)(); ret = vc_allocate(currcons); if (ret) return ret; vc = vc_cons[currcons].d; /* Still being freed */ if (vc->port.tty) return -ERESTARTSYS; ret = tty_port_install(&vc->port, driver, tty); if (ret) return ret; tty->driver_data = vc; vc->port.tty = tty; tty_port_get(&vc->port); if (!tty->winsize.ws_row && !tty->winsize.ws_col) { tty->winsize.ws_row = vc_cons[currcons].d->vc_rows; tty->winsize.ws_col = vc_cons[currcons].d->vc_cols; } if (vc->vc_utf) tty->termios.c_iflag |= IUTF8; else tty->termios.c_iflag &= ~IUTF8; return 0; } static int con_open(struct tty_struct *tty, struct file *filp) { /* everything done in install */ return 0; } static void con_close(struct tty_struct *tty, struct file *filp) { /* Nothing to do - we defer to shutdown */ } static void con_shutdown(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; BUG_ON(vc == NULL); guard(console_lock)(); vc->port.tty = NULL; } static void con_cleanup(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; tty_port_put(&vc->port); } /* * We can't deal with anything but the N_TTY ldisc, * because we can sleep in our write() routine. */ static int con_ldisc_ok(struct tty_struct *tty, int ldisc) { return ldisc == N_TTY ? 0 : -EINVAL; } static int default_color = 7; /* white */ static int default_italic_color = 2; // green (ASCII) static int default_underline_color = 3; // cyan (ASCII) module_param_named(color, default_color, int, S_IRUGO | S_IWUSR); module_param_named(italic, default_italic_color, int, S_IRUGO | S_IWUSR); module_param_named(underline, default_underline_color, int, S_IRUGO | S_IWUSR); static void vc_init(struct vc_data *vc, int do_clear) { int j, k ; set_origin(vc); vc->vc_pos = vc->vc_origin; reset_vc(vc); for (j=k=0; j<16; j++) { vc->vc_palette[k++] = default_red[j] ; vc->vc_palette[k++] = default_grn[j] ; vc->vc_palette[k++] = default_blu[j] ; } vc->vc_def_color = default_color; vc->vc_ulcolor = default_underline_color; vc->vc_itcolor = default_italic_color; vc->vc_halfcolor = 0x08; /* grey */ init_waitqueue_head(&vc->paste_wait); reset_terminal(vc, do_clear); } /* * This routine initializes console interrupts, and does nothing * else. If you want the screen to clear, call tty_write with * the appropriate escape-sequence. */ static int __init con_init(void) { const char *display_desc = NULL; struct vc_data *vc; unsigned int currcons = 0, i; console_lock(); if (!conswitchp) conswitchp = &dummy_con; display_desc = conswitchp->con_startup(); if (!display_desc) { fg_console = 0; console_unlock(); return 0; } for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con_driver = ®istered_con_driver[i]; if (con_driver->con == NULL) { con_driver->con = conswitchp; con_driver->desc = display_desc; con_driver->flag = CON_DRIVER_FLAG_INIT; con_driver->first = 0; con_driver->last = MAX_NR_CONSOLES - 1; break; } } for (i = 0; i < MAX_NR_CONSOLES; i++) con_driver_map[i] = conswitchp; if (blankinterval) { blank_state = blank_normal_wait; mod_timer(&console_timer, jiffies + (blankinterval * HZ)); } for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT); INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); tty_port_init(&vc->port); visual_init(vc, currcons, true); /* Assuming vc->vc_{cols,rows,screenbuf_size} are sane here. 
*/ vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT); vc_init(vc, currcons || !vc->vc_sw->con_save_screen); } currcons = fg_console = 0; master_display_fg = vc = vc_cons[currcons].d; set_origin(vc); save_screen(vc); gotoxy(vc, vc->state.x, vc->state.y); csi_J(vc, CSI_J_CURSOR_TO_END); update_screen(vc); pr_info("Console: %s %s %dx%d\n", vc->vc_can_do_color ? "colour" : "mono", display_desc, vc->vc_cols, vc->vc_rows); console_unlock(); #ifdef CONFIG_VT_CONSOLE register_console(&vt_console_driver); #endif return 0; } console_initcall(con_init); static const struct tty_operations con_ops = { .install = con_install, .open = con_open, .close = con_close, .write = con_write, .write_room = con_write_room, .put_char = con_put_char, .flush_chars = con_flush_chars, .ioctl = vt_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vt_compat_ioctl, #endif .stop = con_stop, .start = con_start, .throttle = con_throttle, .unthrottle = con_unthrottle, .resize = vt_resize, .shutdown = con_shutdown, .cleanup = con_cleanup, .ldisc_ok = con_ldisc_ok, }; static struct cdev vc0_cdev; static ssize_t show_tty_active(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "tty%d\n", fg_console + 1); } static DEVICE_ATTR(active, S_IRUGO, show_tty_active, NULL); static struct attribute *vt_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(vt_dev); int __init vty_init(const struct file_operations *console_fops) { cdev_init(&vc0_cdev, console_fops); if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) || register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0) panic("Couldn't register /dev/tty0 driver\n"); tty0dev = device_create_with_groups(&tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, vt_dev_groups, "tty0"); if (IS_ERR(tty0dev)) tty0dev = NULL; vcs_init(); console_driver = tty_alloc_driver(MAX_NR_CONSOLES, TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS); if (IS_ERR(console_driver)) panic("Couldn't allocate console driver\n"); console_driver->name = "tty"; console_driver->name_base = 1; console_driver->major = TTY_MAJOR; console_driver->minor_start = 1; console_driver->type = TTY_DRIVER_TYPE_CONSOLE; console_driver->init_termios = tty_std_termios; if (default_utf8) console_driver->init_termios.c_iflag |= IUTF8; tty_set_operations(console_driver, &con_ops); if (tty_register_driver(console_driver)) panic("Couldn't register console driver\n"); kbd_init(); console_map_init(); #ifdef CONFIG_MDA_CONSOLE mda_console_init(); #endif return 0; } static const struct class vtconsole_class = { .name = "vtconsole", }; static int do_bind_con_driver(const struct consw *csw, int first, int last, int deflt) { struct module *owner = csw->owner; const char *desc = NULL; struct con_driver *con_driver; int i, j = -1, k = -1, retval = -ENODEV; if (!try_module_get(owner)) return -ENODEV; WARN_CONSOLE_UNLOCKED(); /* check if driver is registered */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = ®istered_con_driver[i]; if (con_driver->con == csw) { desc = con_driver->desc; retval = 0; break; } } if (retval) goto err; if (!(con_driver->flag & CON_DRIVER_FLAG_INIT)) { csw->con_startup(); con_driver->flag |= CON_DRIVER_FLAG_INIT; } if (deflt) { if (conswitchp) module_put(conswitchp->owner); __module_get(owner); conswitchp = csw; } first = max(first, con_driver->first); last = min(last, con_driver->last); for (i = first; i <= last; i++) { int old_was_color; struct vc_data *vc = vc_cons[i].d; if (con_driver_map[i]) module_put(con_driver_map[i]->owner); __module_get(owner); con_driver_map[i] = 
csw; if (!vc || !vc->vc_sw) continue; j = i; if (con_is_visible(vc)) { k = i; save_screen(vc); } old_was_color = vc->vc_can_do_color; vc->vc_sw->con_deinit(vc); vc->vc_origin = (unsigned long)vc->vc_screenbuf; visual_init(vc, i, false); set_origin(vc); update_attr(vc); /* If the console changed between mono <-> color, then * the attributes in the screenbuf will be wrong. The * following resets all attributes to something sane. */ if (old_was_color != vc->vc_can_do_color) clear_buffer_attributes(vc); } pr_info("Console: switching "); if (!deflt) pr_cont("consoles %d-%d ", first + 1, last + 1); if (j >= 0) { struct vc_data *vc = vc_cons[j].d; pr_cont("to %s %s %dx%d\n", vc->vc_can_do_color ? "colour" : "mono", desc, vc->vc_cols, vc->vc_rows); if (k >= 0) { vc = vc_cons[k].d; update_screen(vc); } } else { pr_cont("to %s\n", desc); } retval = 0; err: module_put(owner); return retval; }; #ifdef CONFIG_VT_HW_CONSOLE_BINDING int do_unbind_con_driver(const struct consw *csw, int first, int last, int deflt) { struct module *owner = csw->owner; const struct consw *defcsw = NULL; struct con_driver *con_driver = NULL, *con_back = NULL; int i, retval = -ENODEV; if (!try_module_get(owner)) return -ENODEV; WARN_CONSOLE_UNLOCKED(); /* check if driver is registered and if it is unbindable */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = ®istered_con_driver[i]; if (con_driver->con == csw && con_driver->flag & CON_DRIVER_FLAG_MODULE) { retval = 0; break; } } if (retval) goto err; retval = -ENODEV; /* check if backup driver exists */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_back = ®istered_con_driver[i]; if (con_back->con && con_back->con != csw) { defcsw = con_back->con; retval = 0; break; } } if (retval) goto err; if (!con_is_bound(csw)) goto err; first = max(first, con_driver->first); last = min(last, con_driver->last); for (i = first; i <= last; i++) { if (con_driver_map[i] == csw) { module_put(csw->owner); con_driver_map[i] = NULL; } } if (!con_is_bound(defcsw)) { const struct consw *defconsw = conswitchp; defcsw->con_startup(); con_back->flag |= CON_DRIVER_FLAG_INIT; /* * vgacon may change the default driver to point * to dummycon, we restore it here... 
*/ conswitchp = defconsw; } if (!con_is_bound(csw)) con_driver->flag &= ~CON_DRIVER_FLAG_INIT; /* ignore return value, binding should not fail */ do_bind_con_driver(defcsw, first, last, deflt); err: module_put(owner); return retval; } EXPORT_SYMBOL_GPL(do_unbind_con_driver); static int vt_bind(struct con_driver *con) { const struct consw *defcsw = NULL, *csw = NULL; int i, more = 1, first = -1, last = -1, deflt = 0; if (!con->con || !(con->flag & CON_DRIVER_FLAG_MODULE)) goto err; csw = con->con; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con = ®istered_con_driver[i]; if (con->con && !(con->flag & CON_DRIVER_FLAG_MODULE)) { defcsw = con->con; break; } } if (!defcsw) goto err; while (more) { more = 0; for (i = con->first; i <= con->last; i++) { if (con_driver_map[i] == defcsw) { if (first == -1) first = i; last = i; more = 1; } else if (first != -1) break; } if (first == 0 && last == MAX_NR_CONSOLES -1) deflt = 1; if (first != -1) do_bind_con_driver(csw, first, last, deflt); first = -1; last = -1; deflt = 0; } err: return 0; } static int vt_unbind(struct con_driver *con) { const struct consw *csw = NULL; int i, more = 1, first = -1, last = -1, deflt = 0; int ret; if (!con->con || !(con->flag & CON_DRIVER_FLAG_MODULE)) goto err; csw = con->con; while (more) { more = 0; for (i = con->first; i <= con->last; i++) { if (con_driver_map[i] == csw) { if (first == -1) first = i; last = i; more = 1; } else if (first != -1) break; } if (first == 0 && last == MAX_NR_CONSOLES -1) deflt = 1; if (first != -1) { ret = do_unbind_con_driver(csw, first, last, deflt); if (ret != 0) return ret; } first = -1; last = -1; deflt = 0; } err: return 0; } #else static inline int vt_bind(struct con_driver *con) { return 0; } static inline int vt_unbind(struct con_driver *con) { return 0; } #endif /* CONFIG_VT_HW_CONSOLE_BINDING */ static ssize_t store_bind(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct con_driver *con = dev_get_drvdata(dev); int bind = simple_strtoul(buf, NULL, 0); guard(console_lock)(); if (bind) vt_bind(con); else vt_unbind(con); return count; } static ssize_t show_bind(struct device *dev, struct device_attribute *attr, char *buf) { struct con_driver *con = dev_get_drvdata(dev); int bind; scoped_guard(console_lock) bind = con_is_bound(con->con); return sysfs_emit(buf, "%i\n", bind); } static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) { struct con_driver *con = dev_get_drvdata(dev); return sysfs_emit(buf, "%s %s\n", (con->flag & CON_DRIVER_FLAG_MODULE) ? 
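/*
 * These attributes are exposed as /sys/class/vtconsole/vtcon<N>/bind and
 * /sys/class/vtconsole/vtcon<N>/name; writing 1 or 0 to "bind" ends up in
 * vt_bind() or vt_unbind() above.
 */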
"(M)" : "(S)", con->desc); } static DEVICE_ATTR(bind, S_IRUGO|S_IWUSR, show_bind, store_bind); static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); static struct attribute *con_dev_attrs[] = { &dev_attr_bind.attr, &dev_attr_name.attr, NULL }; ATTRIBUTE_GROUPS(con_dev); static int vtconsole_init_device(struct con_driver *con) { con->flag |= CON_DRIVER_FLAG_ATTR; return 0; } static void vtconsole_deinit_device(struct con_driver *con) { con->flag &= ~CON_DRIVER_FLAG_ATTR; } /** * con_is_bound - checks if driver is bound to the console * @csw: console driver * * RETURNS: zero if unbound, nonzero if bound * * Drivers can call this and if zero, they should release * all resources allocated on &consw.con_startup() */ int con_is_bound(const struct consw *csw) { int i, bound = 0; WARN_CONSOLE_UNLOCKED(); for (i = 0; i < MAX_NR_CONSOLES; i++) { if (con_driver_map[i] == csw) { bound = 1; break; } } return bound; } EXPORT_SYMBOL(con_is_bound); /** * con_is_visible - checks whether the current console is visible * @vc: virtual console * * RETURNS: zero if not visible, nonzero if visible */ bool con_is_visible(const struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); return *vc->vc_display_fg == vc; } EXPORT_SYMBOL(con_is_visible); /** * con_debug_enter - prepare the console for the kernel debugger * @vc: virtual console * * Called when the console is taken over by the kernel debugger, this * function needs to save the current console state, then put the console * into a state suitable for the kernel debugger. */ void con_debug_enter(struct vc_data *vc) { saved_fg_console = fg_console; saved_last_console = last_console; saved_want_console = want_console; saved_vc_mode = vc->vc_mode; saved_console_blanked = console_blanked; vc->vc_mode = KD_TEXT; console_blanked = 0; if (vc->vc_sw->con_debug_enter) vc->vc_sw->con_debug_enter(vc); #ifdef CONFIG_KGDB_KDB /* Set the initial LINES variable if it is not already set */ if (vc->vc_rows < 999) { int linecount; char lns[4]; const char *setargs[3] = { "set", "LINES", lns, }; if (kdbgetintenv(setargs[0], &linecount)) { snprintf(lns, 4, "%i", vc->vc_rows); kdb_set(2, setargs); } } if (vc->vc_cols < 999) { int colcount; char cols[4]; const char *setargs[3] = { "set", "COLUMNS", cols, }; if (kdbgetintenv(setargs[0], &colcount)) { snprintf(cols, 4, "%i", vc->vc_cols); kdb_set(2, setargs); } } #endif /* CONFIG_KGDB_KDB */ } EXPORT_SYMBOL_GPL(con_debug_enter); /** * con_debug_leave - restore console state * * Restore the console state to what it was before the kernel debugger * was invoked. 
*/ void con_debug_leave(void) { struct vc_data *vc; fg_console = saved_fg_console; last_console = saved_last_console; want_console = saved_want_console; console_blanked = saved_console_blanked; vc_cons[fg_console].d->vc_mode = saved_vc_mode; vc = vc_cons[fg_console].d; if (vc->vc_sw->con_debug_leave) vc->vc_sw->con_debug_leave(vc); } EXPORT_SYMBOL_GPL(con_debug_leave); static int do_register_con_driver(const struct consw *csw, int first, int last) { struct module *owner = csw->owner; struct con_driver *con_driver; const char *desc; int i, retval; WARN_CONSOLE_UNLOCKED(); if (!try_module_get(owner)) return -ENODEV; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = ®istered_con_driver[i]; /* already registered */ if (con_driver->con == csw) { retval = -EBUSY; goto err; } } desc = csw->con_startup(); if (!desc) { retval = -ENODEV; goto err; } retval = -EINVAL; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = ®istered_con_driver[i]; if (con_driver->con == NULL && !(con_driver->flag & CON_DRIVER_FLAG_ZOMBIE)) { con_driver->con = csw; con_driver->desc = desc; con_driver->node = i; con_driver->flag = CON_DRIVER_FLAG_MODULE | CON_DRIVER_FLAG_INIT; con_driver->first = first; con_driver->last = last; retval = 0; break; } } if (retval) goto err; con_driver->dev = device_create_with_groups(&vtconsole_class, NULL, MKDEV(0, con_driver->node), con_driver, con_dev_groups, "vtcon%i", con_driver->node); if (IS_ERR(con_driver->dev)) { pr_warn("Unable to create device for %s; errno = %ld\n", con_driver->desc, PTR_ERR(con_driver->dev)); con_driver->dev = NULL; } else { vtconsole_init_device(con_driver); } err: module_put(owner); return retval; } /** * do_unregister_con_driver - unregister console driver from console layer * @csw: console driver * * DESCRIPTION: All drivers that registers to the console layer must * call this function upon exit, or if the console driver is in a state * where it won't be able to handle console services, such as the * framebuffer console without loaded framebuffer drivers. * * The driver must unbind first prior to unregistration. */ int do_unregister_con_driver(const struct consw *csw) { int i; /* cannot unregister a bound driver */ if (con_is_bound(csw)) return -EBUSY; if (csw == conswitchp) return -EINVAL; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con_driver = ®istered_con_driver[i]; if (con_driver->con == csw) { /* * Defer the removal of the sysfs entries since that * will acquire the kernfs s_active lock and we can't * acquire this lock while holding the console lock: * the unbind sysfs entry imposes already the opposite * order. Reset con already here to prevent any later * lookup to succeed and mark this slot as zombie, so * it won't get reused until we complete the removal * in the deferred work. 
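 * (The deferred work, con_driver_unregister_callback() below, temporarily
 * drops the console lock around device_destroy() for exactly this reason.)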
*/ con_driver->con = NULL; con_driver->flag = CON_DRIVER_FLAG_ZOMBIE; schedule_work(&con_driver_unregister_work); return 0; } } return -ENODEV; } EXPORT_SYMBOL_GPL(do_unregister_con_driver); static void con_driver_unregister_callback(struct work_struct *ignored) { int i; guard(console_lock)(); for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con_driver = ®istered_con_driver[i]; if (!(con_driver->flag & CON_DRIVER_FLAG_ZOMBIE)) continue; console_unlock(); vtconsole_deinit_device(con_driver); device_destroy(&vtconsole_class, MKDEV(0, con_driver->node)); console_lock(); if (WARN_ON_ONCE(con_driver->con)) con_driver->con = NULL; con_driver->desc = NULL; con_driver->dev = NULL; con_driver->node = 0; WARN_ON_ONCE(con_driver->flag != CON_DRIVER_FLAG_ZOMBIE); con_driver->flag = 0; con_driver->first = 0; con_driver->last = 0; } } /* * If we support more console drivers, this function is used * when a driver wants to take over some existing consoles * and become default driver for newly opened ones. * * do_take_over_console is basically a register followed by bind */ int do_take_over_console(const struct consw *csw, int first, int last, int deflt) { int err; err = do_register_con_driver(csw, first, last); /* * If we get an busy error we still want to bind the console driver * and return success, as we may have unbound the console driver * but not unregistered it. */ if (err == -EBUSY) err = 0; if (!err) do_bind_con_driver(csw, first, last, deflt); return err; } EXPORT_SYMBOL_GPL(do_take_over_console); /* * give_up_console is a wrapper to unregister_con_driver. It will only * work if driver is fully unbound. */ void give_up_console(const struct consw *csw) { guard(console_lock)(); do_unregister_con_driver(csw); } EXPORT_SYMBOL(give_up_console); static int __init vtconsole_class_init(void) { int i; i = class_register(&vtconsole_class); if (i) pr_warn("Unable to create vt console class; errno = %d\n", i); /* Add system drivers to sysfs */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con = ®istered_con_driver[i]; if (con->con && !con->dev) { con->dev = device_create_with_groups(&vtconsole_class, NULL, MKDEV(0, con->node), con, con_dev_groups, "vtcon%i", con->node); if (IS_ERR(con->dev)) { pr_warn("Unable to create device for %s; errno = %ld\n", con->desc, PTR_ERR(con->dev)); con->dev = NULL; } else { vtconsole_init_device(con); } } } return 0; } postcore_initcall(vtconsole_class_init); /* * Screen blanking */ static int set_vesa_blanking(u8 __user *mode_user) { u8 mode; if (get_user(mode, mode_user)) return -EFAULT; guard(console_lock)(); vesa_blank_mode = (mode <= VESA_BLANK_MAX) ? mode : VESA_NO_BLANKING; return 0; } void do_blank_screen(int entering_gfx) { struct vc_data *vc = vc_cons[fg_console].d; int i; might_sleep(); WARN_CONSOLE_UNLOCKED(); if (console_blanked) { if (blank_state == blank_vesa_wait) { blank_state = blank_off; vc->vc_sw->con_blank(vc, vesa_blank_mode + 1, 0); } return; } /* entering graphics mode? */ if (entering_gfx) { hide_cursor(vc); save_screen(vc); vc->vc_sw->con_blank(vc, VESA_VSYNC_SUSPEND, 1); console_blanked = fg_console + 1; blank_state = blank_off; set_origin(vc); return; } blank_state = blank_off; /* don't blank graphics */ if (vc->vc_mode != KD_TEXT) { console_blanked = fg_console + 1; return; } hide_cursor(vc); timer_delete_sync(&console_timer); blank_timer_expired = 0; save_screen(vc); /* In case we need to reset origin, blanking hook returns 1 */ i = vc->vc_sw->con_blank(vc, vesa_off_interval ? 
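/*
 * vesa_blank_mode is chosen from user space through the TIOCL_SETVESABLANK
 * subcode of TIOCLINUX (one mode byte following the subcode byte, see
 * set_vesa_blanking() above).
 */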
VESA_VSYNC_SUSPEND : (vesa_blank_mode + 1), 0); console_blanked = fg_console + 1; if (i) set_origin(vc); if (console_blank_hook && console_blank_hook(1)) return; if (vesa_off_interval && vesa_blank_mode) { blank_state = blank_vesa_wait; mod_timer(&console_timer, jiffies + vesa_off_interval); } vt_event_post(VT_EVENT_BLANK, vc->vc_num, vc->vc_num); } EXPORT_SYMBOL(do_blank_screen); /* * Called by timer as well as from vt_console_driver */ void do_unblank_screen(int leaving_gfx) { struct vc_data *vc; /* This should now always be called from a "sane" (read: can schedule) * context for the sake of the low level drivers, except in the special * case of oops_in_progress */ if (!oops_in_progress) might_sleep(); WARN_CONSOLE_UNLOCKED(); ignore_poke = 0; if (!console_blanked) return; if (!vc_cons_allocated(fg_console)) { /* impossible */ pr_warn("unblank_screen: tty %d not allocated ??\n", fg_console + 1); return; } vc = vc_cons[fg_console].d; if (vc->vc_mode != KD_TEXT) return; /* but leave console_blanked != 0 */ if (blankinterval) { mod_timer(&console_timer, jiffies + (blankinterval * HZ)); blank_state = blank_normal_wait; } console_blanked = 0; if (vc->vc_sw->con_blank(vc, VESA_NO_BLANKING, leaving_gfx)) /* Low-level driver cannot restore -> do it ourselves */ update_screen(vc); if (console_blank_hook) console_blank_hook(0); set_palette(vc); set_cursor(vc); vt_event_post(VT_EVENT_UNBLANK, vc->vc_num, vc->vc_num); notify_update(vc); } EXPORT_SYMBOL(do_unblank_screen); /* * This is called by the outside world to cause a forced unblank, mostly for * oopses. Currently, I just call do_unblank_screen(0), but we could eventually * call it with 1 as an argument and so force a mode restore... that may kill * X or at least garbage the screen but would also make the Oops visible... */ static void unblank_screen(void) { do_unblank_screen(0); } /* * We defer the timer blanking to work queue so it can take the console mutex * (console operations can still happen at irq time, but only from printk which * has the console mutex. Not perfect yet, but better than no locking */ static void blank_screen_t(struct timer_list *unused) { blank_timer_expired = 1; schedule_work(&console_work); } void poke_blanked_console(void) { WARN_CONSOLE_UNLOCKED(); /* Add this so we quickly catch whoever might call us in a non * safe context. Nowadays, unblank_screen() isn't to be called in * atomic contexts and is allowed to schedule (with the special case * of oops_in_progress, but that isn't of any concern for this * function. --BenH. */ might_sleep(); /* This isn't perfectly race free, but a race here would be mostly harmless, * at worst, we'll do a spurious blank and it's unlikely */ timer_delete(&console_timer); blank_timer_expired = 0; if (ignore_poke || !vc_cons[fg_console].d || vc_cons[fg_console].d->vc_mode == KD_GRAPHICS) return; if (console_blanked) unblank_screen(); else if (blankinterval) { mod_timer(&console_timer, jiffies + (blankinterval * HZ)); blank_state = blank_normal_wait; } } /* * Palettes */ static void set_palette(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (vc->vc_mode != KD_GRAPHICS && vc->vc_sw->con_set_palette) vc->vc_sw->con_set_palette(vc, color_table); } /* * Load palette into the DAC registers. arg points to a colour * map, 3 bytes per colour, 16 colours, range from 0 to 255. 
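 * A minimal user-space sketch (assuming <linux/kd.h>, a console fd and the
 * usual PIO_CMAP routing in vt_ioctl()), setting only the first entry to a
 * dark grey:
 *
 *	unsigned char map[3 * 16] = { 0 };
 *
 *	map[0] = map[1] = map[2] = 0x20;
 *	ioctl(fd, PIO_CMAP, map);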
*/ int con_set_cmap(unsigned char __user *arg) { int i, j, k; unsigned char colormap[3*16]; if (copy_from_user(colormap, arg, sizeof(colormap))) return -EFAULT; guard(console_lock)(); for (i = k = 0; i < 16; i++) { default_red[i] = colormap[k++]; default_grn[i] = colormap[k++]; default_blu[i] = colormap[k++]; } for (i = 0; i < MAX_NR_CONSOLES; i++) { if (!vc_cons_allocated(i)) continue; for (j = k = 0; j < 16; j++) { vc_cons[i].d->vc_palette[k++] = default_red[j]; vc_cons[i].d->vc_palette[k++] = default_grn[j]; vc_cons[i].d->vc_palette[k++] = default_blu[j]; } set_palette(vc_cons[i].d); } return 0; } int con_get_cmap(unsigned char __user *arg) { int i, k; unsigned char colormap[3*16]; scoped_guard(console_lock) for (i = k = 0; i < 16; i++) { colormap[k++] = default_red[i]; colormap[k++] = default_grn[i]; colormap[k++] = default_blu[i]; } if (copy_to_user(arg, colormap, sizeof(colormap))) return -EFAULT; return 0; } void reset_palette(struct vc_data *vc) { int j, k; for (j=k=0; j<16; j++) { vc->vc_palette[k++] = default_red[j]; vc->vc_palette[k++] = default_grn[j]; vc->vc_palette[k++] = default_blu[j]; } set_palette(vc); } /* * Font switching * * Currently we only support fonts up to 128 pixels wide, at a maximum height * of 128 pixels. Userspace fontdata may have to be stored with 32 bytes * (shorts/ints, depending on width) reserved for each character which is * kinda wasty, but this is done in order to maintain compatibility with the * EGA/VGA fonts. It is up to the actual low-level console-driver convert data * into its favorite format (maybe we should add a `fontoffset' field to the * `display' structure so we won't have to convert the fontdata all the time. * /Jes */ #define max_font_width 64 #define max_font_height 128 #define max_font_glyphs 512 #define max_font_size (max_font_glyphs*max_font_width*max_font_height) static int con_font_get(struct vc_data *vc, struct console_font_op *op) { struct console_font font; int c; unsigned int vpitch = op->op == KD_FONT_OP_GET_TALL ? op->height : 32; if (vpitch > max_font_height) return -EINVAL; void *font_data __free(kvfree) = NULL; if (op->data) { font.data = font_data = kvzalloc(max_font_size, GFP_KERNEL); if (!font.data) return -ENOMEM; } else font.data = NULL; scoped_guard(console_lock) { if (vc->vc_mode != KD_TEXT) return -EINVAL; if (!vc->vc_sw->con_font_get) return -ENOSYS; int ret = vc->vc_sw->con_font_get(vc, &font, vpitch); if (ret) return ret; } c = DIV_ROUND_UP(font.width, 8) * vpitch * font.charcount; if (op->data && font.charcount > op->charcount) return -ENOSPC; if (font.width > op->width || font.height > op->height) return -ENOSPC; op->height = font.height; op->width = font.width; op->charcount = font.charcount; if (op->data && copy_to_user(op->data, font.data, c)) return -EFAULT; return 0; } static int con_font_set(struct vc_data *vc, const struct console_font_op *op) { struct console_font font; int size; unsigned int vpitch = op->op == KD_FONT_OP_SET_TALL ? 
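/*
 * A minimal user-space sketch of the matching KDFONTOP interface (assuming
 * <linux/kd.h> and a console fd).  This reads the current font back into a
 * caller-provided buffer sized for glyph widths up to 32 pixels:
 *
 *	unsigned char buf[512 * 32 * 4];
 *	struct console_font_op cfo = {
 *		.op        = KD_FONT_OP_GET,
 *		.width     = 32,
 *		.height    = 32,
 *		.charcount = 512,
 *		.data      = buf,
 *	};
 *
 *	if (ioctl(fd, KDFONTOP, &cfo) == 0)
 *		printf("%ux%u font, %u glyphs\n",
 *		       cfo.width, cfo.height, cfo.charcount);
 */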
op->height : 32; if (!op->data) return -EINVAL; if (op->charcount > max_font_glyphs) return -EINVAL; if (op->width <= 0 || op->width > max_font_width || !op->height || op->height > max_font_height) return -EINVAL; if (vpitch < op->height) return -EINVAL; size = DIV_ROUND_UP(op->width, 8) * vpitch * op->charcount; if (size > max_font_size) return -ENOSPC; void *font_data __free(kfree) = font.data = memdup_user(op->data, size); if (IS_ERR(font.data)) return PTR_ERR(font.data); font.charcount = op->charcount; font.width = op->width; font.height = op->height; guard(console_lock)(); if (vc->vc_mode != KD_TEXT) return -EINVAL; if (!vc->vc_sw->con_font_set) return -ENOSYS; if (vc_is_sel(vc)) clear_selection(); return vc->vc_sw->con_font_set(vc, &font, vpitch, op->flags); } static int con_font_default(struct vc_data *vc, struct console_font_op *op) { struct console_font font = {.width = op->width, .height = op->height}; char name[MAX_FONT_NAME]; char *s = name; if (!op->data) s = NULL; else if (strncpy_from_user(name, op->data, MAX_FONT_NAME - 1) < 0) return -EFAULT; else name[MAX_FONT_NAME - 1] = 0; scoped_guard(console_lock) { if (vc->vc_mode != KD_TEXT) return -EINVAL; if (!vc->vc_sw->con_font_default) return -ENOSYS; if (vc_is_sel(vc)) clear_selection(); int ret = vc->vc_sw->con_font_default(vc, &font, s); if (ret) return ret; } op->width = font.width; op->height = font.height; return 0; } int con_font_op(struct vc_data *vc, struct console_font_op *op) { switch (op->op) { case KD_FONT_OP_SET: case KD_FONT_OP_SET_TALL: return con_font_set(vc, op); case KD_FONT_OP_GET: case KD_FONT_OP_GET_TALL: return con_font_get(vc, op); case KD_FONT_OP_SET_DEFAULT: return con_font_default(vc, op); case KD_FONT_OP_COPY: /* was buggy and never really used */ return -EINVAL; } return -ENOSYS; } /* * Interface exported to selection and vcs. */ /* used by selection */ u16 screen_glyph(const struct vc_data *vc, int offset) { u16 w = scr_readw(screenpos(vc, offset, true)); u16 c = w & 0xff; if (w & vc->vc_hi_font_mask) c |= 0x100; return c; } EXPORT_SYMBOL_GPL(screen_glyph); u32 screen_glyph_unicode(const struct vc_data *vc, int n) { u32 **uni_lines = vc->vc_uni_lines; if (uni_lines) return uni_lines[n / vc->vc_cols][n % vc->vc_cols]; return inverse_translate(vc, screen_glyph(vc, n * 2), true); } EXPORT_SYMBOL_GPL(screen_glyph_unicode); /* used by vcs - note the word offset */ unsigned short *screen_pos(const struct vc_data *vc, int w_offset, bool viewed) { return screenpos(vc, 2 * w_offset, viewed); } EXPORT_SYMBOL_GPL(screen_pos); void getconsxy(const struct vc_data *vc, unsigned char xy[static 2]) { /* clamp values if they don't fit */ xy[0] = min(vc->state.x, 0xFFu); xy[1] = min(vc->state.y, 0xFFu); } void putconsxy(struct vc_data *vc, unsigned char xy[static const 2]) { hide_cursor(vc); gotoxy(vc, xy[0], xy[1]); set_cursor(vc); } u16 vcs_scr_readw(const struct vc_data *vc, const u16 *org) { if ((unsigned long)org == vc->vc_pos && softcursor_original != -1) return softcursor_original; return scr_readw(org); } void vcs_scr_writew(struct vc_data *vc, u16 val, u16 *org) { scr_writew(val, org); if ((unsigned long)org == vc->vc_pos) { softcursor_original = -1; add_softcursor(vc); } } void vcs_scr_updated(struct vc_data *vc) { notify_update(vc); } |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_DEVICE_H
#define _SCSI_SCSI_DEVICE_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>
#include <scsi/scsi.h>
#include <linux/atomic.h>
#include <linux/sbitmap.h>

struct bsg_device;
struct device;
struct request_queue;
struct scsi_cmnd;
struct scsi_lun;
struct scsi_sense_hdr;

typedef __u64 __bitwise blist_flags_t;

#define SCSI_SENSE_BUFFERSIZE	96

struct scsi_mode_data {
	__u32	length;
	__u16	block_descriptor_length;
	__u8	medium_type;
	__u8	device_specific;
	__u8	header_length;
	__u8	longlba:1;
};

/*
 * sdev state: If you alter this, you also need to alter scsi_sysfs.c
 * (for the ascii descriptions) and the state model
enforcer: * scsi_lib:scsi_device_set_state(). */ enum scsi_device_state { SDEV_CREATED = 1, /* device created but not added to sysfs * Only internal commands allowed (for inq) */ SDEV_RUNNING, /* device properly configured * All commands allowed */ SDEV_CANCEL, /* beginning to delete device * Only error handler commands allowed */ SDEV_DEL, /* device deleted * no commands allowed */ SDEV_QUIESCE, /* Device quiescent. No block commands * will be accepted, only specials (which * originate in the mid-layer) */ SDEV_OFFLINE, /* Device offlined (by error handling or * user request */ SDEV_TRANSPORT_OFFLINE, /* Offlined by transport class error handler */ SDEV_BLOCK, /* Device blocked by scsi lld. No * scsi commands from user or midlayer * should be issued to the scsi * lld. */ SDEV_CREATED_BLOCK, /* same as above but for created devices */ }; enum scsi_scan_mode { SCSI_SCAN_INITIAL = 0, SCSI_SCAN_RESCAN, SCSI_SCAN_MANUAL, }; enum scsi_device_event { SDEV_EVT_MEDIA_CHANGE = 1, /* media has changed */ SDEV_EVT_INQUIRY_CHANGE_REPORTED, /* 3F 03 UA reported */ SDEV_EVT_CAPACITY_CHANGE_REPORTED, /* 2A 09 UA reported */ SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED, /* 38 07 UA reported */ SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED, /* 2A 01 UA reported */ SDEV_EVT_LUN_CHANGE_REPORTED, /* 3F 0E UA reported */ SDEV_EVT_ALUA_STATE_CHANGE_REPORTED, /* 2A 06 UA reported */ SDEV_EVT_POWER_ON_RESET_OCCURRED, /* 29 00 UA reported */ SDEV_EVT_FIRST = SDEV_EVT_MEDIA_CHANGE, SDEV_EVT_LAST = SDEV_EVT_POWER_ON_RESET_OCCURRED, SDEV_EVT_MAXBITS = SDEV_EVT_LAST + 1 }; struct scsi_event { enum scsi_device_event evt_type; struct list_head node; /* put union of data structures, for non-simple event types, * here */ }; /** * struct scsi_vpd - SCSI Vital Product Data * @rcu: For kfree_rcu(). * @len: Length in bytes of @data. * @data: VPD data as defined in various T10 SCSI standard documents. */ struct scsi_vpd { struct rcu_head rcu; int len; unsigned char data[]; }; struct scsi_device { struct Scsi_Host *host; struct request_queue *request_queue; /* the next two are protected by the host->host_lock */ struct list_head siblings; /* list of all devices on this host */ struct list_head same_target_siblings; /* just the devices sharing same target id */ struct sbitmap budget_map; atomic_t device_blocked; /* Device returned QUEUE_FULL. */ atomic_t restarts; spinlock_t list_lock; struct list_head starved_entry; unsigned short queue_depth; /* How deep of a queue we want */ unsigned short max_queue_depth; /* max queue depth */ unsigned short last_queue_full_depth; /* These two are used by */ unsigned short last_queue_full_count; /* scsi_track_queue_full() */ unsigned long last_queue_full_time; /* last queue full time */ unsigned long queue_ramp_up_period; /* ramp up period in jiffies */ #define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ) unsigned long last_queue_ramp_up; /* last queue ramp up time */ unsigned int id, channel; u64 lun; unsigned int manufacturer; /* Manufacturer of device, for using * vendor-specific cmd's */ unsigned sector_size; /* size in bytes */ void *hostdata; /* available to low-level driver */ unsigned char type; char scsi_level; char inq_periph_qual; /* PQ from INQUIRY data */ struct mutex inquiry_mutex; unsigned char inquiry_len; /* valid bytes in 'inquiry' */ unsigned char * inquiry; /* INQUIRY response data */ const char * vendor; /* [back_compat] point into 'inquiry' ... */ const char * model; /* ... after scan; point to static string */ const char * rev; /* ... 
"nullnullnullnull" before scan */ #define SCSI_DEFAULT_VPD_LEN 255 /* default SCSI VPD page size (max) */ struct scsi_vpd __rcu *vpd_pg0; struct scsi_vpd __rcu *vpd_pg83; struct scsi_vpd __rcu *vpd_pg80; struct scsi_vpd __rcu *vpd_pg89; struct scsi_vpd __rcu *vpd_pgb0; struct scsi_vpd __rcu *vpd_pgb1; struct scsi_vpd __rcu *vpd_pgb2; struct scsi_vpd __rcu *vpd_pgb7; struct scsi_target *sdev_target; blist_flags_t sdev_bflags; /* black/white flags as also found in * scsi_devinfo.[hc]. For now used only to * pass settings from sdev_init to scsi * core. */ unsigned int eh_timeout; /* Error handling timeout */ /* * If true, let the high-level device driver (sd) manage the device * power state for system suspend/resume (suspend to RAM and * hibernation) operations. */ unsigned manage_system_start_stop:1; /* * If true, let the high-level device driver (sd) manage the device * power state for runtime device suspand and resume operations. */ unsigned manage_runtime_start_stop:1; /* * If true, let the high-level device driver (sd) manage the device * power state for system shutdown (power off) operations. */ unsigned manage_shutdown:1; /* * If true, let the high-level device driver (sd) manage the device * power state for system restart (reboot) operations. */ unsigned manage_restart:1; /* * If set and if the device is runtime suspended, ask the high-level * device driver (sd) to force a runtime resume of the device. */ unsigned force_runtime_start_on_system_start:1; /* * Set if the device is an ATA device. */ unsigned is_ata:1; unsigned removable:1; unsigned changed:1; /* Data invalid due to media change */ unsigned busy:1; /* Used to prevent races */ unsigned lockable:1; /* Able to prevent media removal */ unsigned locked:1; /* Media removal disabled */ unsigned borken:1; /* Tell the Seagate driver to be * painfully slow on this device */ unsigned disconnect:1; /* can disconnect */ unsigned soft_reset:1; /* Uses soft reset option */ unsigned sdtr:1; /* Device supports SDTR messages */ unsigned wdtr:1; /* Device supports WDTR messages */ unsigned ppr:1; /* Device supports PPR messages */ unsigned tagged_supported:1; /* Supports SCSI-II tagged queuing */ unsigned simple_tags:1; /* simple queue tag messages are enabled */ unsigned was_reset:1; /* There was a bus reset on the bus for * this device */ unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN * because we did a bus reset. */ unsigned use_10_for_rw:1; /* first try 10-byte read / write */ unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */ unsigned read_before_ms:1; /* perform a READ before MODE SENSE */ unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ unsigned no_write_same:1; /* no WRITE SAME command */ unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ unsigned use_16_for_sync:1; /* Use sync (16) over sync (10) */ unsigned skip_ms_page_8:1; /* do not use MODE SENSE page 0x08 */ unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */ unsigned skip_vpd_pages:1; /* do not read VPD pages */ unsigned try_vpd_pages:1; /* attempt to read VPD pages */ unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */ unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned start_stop_pwr_cond:1; /* Set power cond. 
in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; unsigned fix_capacity:1; /* READ_CAPACITY is too high by 1 */ unsigned guess_capacity:1; /* READ_CAPACITY might be too high by 1 */ unsigned retry_hwerror:1; /* Retry HARDWARE_ERROR */ unsigned last_sector_bug:1; /* do not use multisector accesses on SD_LAST_BUGGY_SECTORS */ unsigned no_read_disc_info:1; /* Avoid READ_DISC_INFO cmds */ unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */ unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */ unsigned security_supported:1; /* Supports Security Protocols */ unsigned is_visible:1; /* is the device visible in sysfs */ unsigned wce_default_on:1; /* Cache is ON by default */ unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ unsigned broken_fua:1; /* Don't set FUA bit */ unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */ unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */ unsigned silence_suspend:1; /* Do not print runtime PM related messages */ unsigned no_vpd_size:1; /* No VPD size reported in header */ unsigned cdl_supported:1; /* Command duration limits supported */ unsigned cdl_enable:1; /* Enable/disable Command duration limits */ unsigned int queue_stopped; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ atomic_t ua_new_media_ctr; /* Counter for New Media UNIT ATTENTIONs */ atomic_t ua_por_ctr; /* Counter for Power On / Reset UAs */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */ DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */ struct list_head event_list; /* asserted events */ struct work_struct event_work; unsigned int max_device_blocked; /* what device_blocked counts down from */ #define SCSI_DEFAULT_DEVICE_BLOCKED 3 atomic_t iorequest_cnt; atomic_t iodone_cnt; atomic_t ioerr_cnt; atomic_t iotmo_cnt; struct device sdev_gendev, sdev_dev; struct work_struct requeue_work; struct scsi_device_handler *handler; void *handler_data; size_t dma_drain_len; void *dma_drain_buf; unsigned int sg_timeout; unsigned int sg_reserved_size; struct bsg_device *bsg_dev; unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; struct task_struct *quiesced_by; unsigned long sdev_data[]; } __attribute__((aligned(sizeof(unsigned long)))); #define to_scsi_device(d) \ container_of(d, struct scsi_device, sdev_gendev) #define class_to_sdev(d) \ container_of(d, struct scsi_device, sdev_dev) #define transport_class_to_sdev(class_dev) \ to_scsi_device(class_dev->parent) #define sdev_dbg(sdev, fmt, a...) \ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) /* * like scmd_printk, but the device name is passed in * as a string pointer */ __printf(4, 5) void sdev_prefix_printk(const char *, const struct scsi_device *, const char *, const char *, ...); #define sdev_printk(l, sdev, fmt, a...) \ sdev_prefix_printk(l, sdev, NULL, fmt, ##a) __printf(3, 4) void scmd_printk(const char *, struct scsi_cmnd *, const char *, ...); #define scmd_dbg(scmd, fmt, a...) 
\ do { \ struct request *__rq = scsi_cmd_to_rq((scmd)); \ \ if (__rq->q->disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ __rq->q->disk->disk_name, ##a); \ else \ sdev_dbg((scmd)->device, fmt, ##a); \ } while (0) enum scsi_target_state { STARGET_CREATED = 1, STARGET_RUNNING, STARGET_REMOVE, STARGET_CREATED_REMOVE, STARGET_DEL, }; /* * scsi_target: representation of a scsi target, for now, this is only * used for single_lun devices. If no one has active IO to the target, * starget_sdev_user is NULL, else it points to the active sdev. */ struct scsi_target { struct scsi_device *starget_sdev_user; struct list_head siblings; struct list_head devices; struct device dev; struct kref reap_ref; /* last put renders target invisible */ unsigned int channel; unsigned int id; /* target id ... replace * scsi_device.id eventually */ unsigned int create:1; /* signal that it needs to be added */ unsigned int single_lun:1; /* Indicates we should only * allow I/O to one of the luns * for the device at a time. */ unsigned int pdt_1f_for_no_lun:1; /* PDT = 0x1f * means no lun present. */ unsigned int no_report_luns:1; /* Don't use * REPORT LUNS for scanning. */ unsigned int expecting_lun_change:1; /* A device has reported * a 3F/0E UA, other devices on * the same target will also. */ /* commands actually active on LLD. */ atomic_t target_busy; atomic_t target_blocked; /* * LLDs should set this in the sdev_init host template callout. * If set to zero then there is not limit. */ unsigned int can_queue; unsigned int max_target_blocked; #define SCSI_DEFAULT_TARGET_BLOCKED 3 char scsi_level; enum scsi_target_state state; void *hostdata; /* available to low-level driver */ unsigned long starget_data[]; /* for the transport */ /* starget_data must be the last element!!!! */ } __attribute__((aligned(sizeof(unsigned long)))); #define to_scsi_target(d) container_of(d, struct scsi_target, dev) static inline struct scsi_target *scsi_target(struct scsi_device *sdev) { return to_scsi_target(sdev->sdev_gendev.parent); } #define transport_class_to_starget(class_dev) \ to_scsi_target(class_dev->parent) #define starget_printk(prefix, starget, fmt, a...) 
\ dev_printk(prefix, &(starget)->dev, fmt, ##a) extern struct scsi_device *__scsi_add_device(struct Scsi_Host *, uint, uint, u64, void *hostdata); extern int scsi_add_device(struct Scsi_Host *host, uint channel, uint target, u64 lun); extern int scsi_register_device_handler(struct scsi_device_handler *scsi_dh); extern void scsi_remove_device(struct scsi_device *); extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh); void scsi_attach_vpd(struct scsi_device *sdev); void scsi_cdl_check(struct scsi_device *sdev); int scsi_cdl_enable(struct scsi_device *sdev, bool enable); extern struct scsi_device *scsi_device_from_queue(struct request_queue *q); extern int __must_check scsi_device_get(struct scsi_device *); extern void scsi_device_put(struct scsi_device *); extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); extern struct scsi_device *__scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); extern struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *, u64); extern struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *, u64); extern void starget_for_each_device(struct scsi_target *, void *, void (*fn)(struct scsi_device *, void *)); extern void __starget_for_each_device(struct scsi_target *, void *, void (*fn)(struct scsi_device *, void *)); /* only exposed to implement shost_for_each_device */ extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *, struct scsi_device *); /** * shost_for_each_device - iterate over all devices of a host * @sdev: the &struct scsi_device to use as a cursor * @shost: the &struct scsi_host to iterate over * * Iterator that returns each device attached to @shost. This loop * takes a reference on each device and releases it at the end. If * you break out of the loop, you must call scsi_device_put(sdev). */ #define shost_for_each_device(sdev, shost) \ for ((sdev) = __scsi_iterate_devices((shost), NULL); \ (sdev); \ (sdev) = __scsi_iterate_devices((shost), (sdev))) /** * __shost_for_each_device - iterate over all devices of a host (UNLOCKED) * @sdev: the &struct scsi_device to use as a cursor * @shost: the &struct scsi_host to iterate over * * Iterator that returns each device attached to @shost. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason to use this is because you need to access the * device list in interrupt context. Otherwise you really want to use * shost_for_each_device instead. 
*/ #define __shost_for_each_device(sdev, shost) \ list_for_each_entry((sdev), &((shost)->__devices), siblings) extern int scsi_change_queue_depth(struct scsi_device *, int); extern int scsi_track_queue_full(struct scsi_device *, int); extern int scsi_set_medium_removal(struct scsi_device *, char); int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int subpage, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr); extern int scsi_get_vpd_page(struct scsi_device *, u8 page, unsigned char *buf, int buf_len); int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa); extern int scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state); extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type, gfp_t gfpflags); extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt); extern void sdev_evt_send_simple(struct scsi_device *sdev, enum scsi_device_event evt_type, gfp_t gfpflags); extern int scsi_device_quiesce(struct scsi_device *sdev); extern void scsi_device_resume(struct scsi_device *sdev); extern void scsi_target_quiesce(struct scsi_target *); extern void scsi_target_resume(struct scsi_target *); extern void scsi_scan_target(struct device *parent, unsigned int channel, unsigned int id, u64 lun, enum scsi_scan_mode rescan); extern void scsi_target_reap(struct scsi_target *); void scsi_block_targets(struct Scsi_Host *shost, struct device *dev); extern void scsi_target_unblock(struct device *, enum scsi_device_state); extern void scsi_remove_target(struct device *); extern const char *scsi_device_state_name(enum scsi_device_state); extern int scsi_is_sdev_device(const struct device *); extern int scsi_is_target_device(const struct device *); extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); /* * scsi_execute_cmd users can set scsi_failure.result to have * scsi_check_passthrough fail/retry a command. scsi_failure.result can be a * specific host byte or message code, or SCMD_FAILURE_RESULT_ANY can be used * to match any host or message code. */ #define SCMD_FAILURE_RESULT_ANY 0x7fffffff /* * Set scsi_failure.result to SCMD_FAILURE_STAT_ANY to fail/retry any failure * scsi_status_is_good returns false for. */ #define SCMD_FAILURE_STAT_ANY 0xff /* * The following can be set to the scsi_failure sense, asc and ascq fields to * match on any sense, ASC, or ASCQ value. */ #define SCMD_FAILURE_SENSE_ANY 0xff #define SCMD_FAILURE_ASC_ANY 0xff #define SCMD_FAILURE_ASCQ_ANY 0xff /* Always retry a matching failure. */ #define SCMD_FAILURE_NO_LIMIT -1 struct scsi_failure { int result; u8 sense; u8 asc; u8 ascq; /* * Number of times scsi_execute_cmd will retry the failure. It does * not count for the total_allowed. */ s8 allowed; /* Number of times the failure has been retried. */ s8 retries; }; struct scsi_failures { /* * If a scsi_failure does not have a retry limit setup this limit will * be used. 
*/ int total_allowed; int total_retries; struct scsi_failure *failure_definitions; }; /* Optional arguments to scsi_execute_cmd */ struct scsi_exec_args { unsigned char *sense; /* sense buffer */ unsigned int sense_len; /* sense buffer len */ struct scsi_sense_hdr *sshdr; /* decoded sense header */ blk_mq_req_flags_t req_flags; /* BLK_MQ_REQ flags */ int scmd_flags; /* SCMD flags */ int *resid; /* residual length */ struct scsi_failures *failures; /* failures to retry */ }; int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, blk_opf_t opf, void *buffer, unsigned int bufflen, int timeout, int retries, const struct scsi_exec_args *args); void scsi_failures_reset_retries(struct scsi_failures *failures); struct scsi_cmnd *scsi_get_internal_cmd(struct scsi_device *sdev, enum dma_data_direction data_direction, blk_mq_req_flags_t flags); void scsi_put_internal_cmd(struct scsi_cmnd *scmd); extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t); extern int scsi_vpd_tpg_id(struct scsi_device *, int *); #ifdef CONFIG_PM extern int scsi_autopm_get_device(struct scsi_device *); extern void scsi_autopm_put_device(struct scsi_device *); #else static inline int scsi_autopm_get_device(struct scsi_device *d) { return 0; } static inline void scsi_autopm_put_device(struct scsi_device *d) {} #endif /* CONFIG_PM */ static inline int __must_check scsi_device_reprobe(struct scsi_device *sdev) { return device_reprobe(&sdev->sdev_gendev); } static inline unsigned int sdev_channel(struct scsi_device *sdev) { return sdev->channel; } static inline unsigned int sdev_id(struct scsi_device *sdev) { return sdev->id; } #define scmd_id(scmd) sdev_id((scmd)->device) #define scmd_channel(scmd) sdev_channel((scmd)->device) /** * scsi_device_is_pseudo_dev() - Whether a device is a pseudo SCSI device. * @sdev: SCSI device to examine * * A pseudo SCSI device can be used to allocate SCSI commands but does not show * up in sysfs. Additionally, the logical unit information in *@sdev is made up. * * This function tests the LUN number instead of comparing @sdev with * @sdev->host->pseudo_sdev because this function may be called before * @sdev->host->pseudo_sdev has been initialized. 
*/ static inline bool scsi_device_is_pseudo_dev(struct scsi_device *sdev) { return sdev->lun == U64_MAX; } /* * checks for positions of the SCSI state machine */ static inline int scsi_device_online(struct scsi_device *sdev) { return (sdev->sdev_state != SDEV_OFFLINE && sdev->sdev_state != SDEV_TRANSPORT_OFFLINE && sdev->sdev_state != SDEV_DEL); } static inline int scsi_device_blocked(struct scsi_device *sdev) { return sdev->sdev_state == SDEV_BLOCK || sdev->sdev_state == SDEV_CREATED_BLOCK; } static inline int scsi_device_created(struct scsi_device *sdev) { return sdev->sdev_state == SDEV_CREATED || sdev->sdev_state == SDEV_CREATED_BLOCK; } int scsi_internal_device_block_nowait(struct scsi_device *sdev); int scsi_internal_device_unblock_nowait(struct scsi_device *sdev, enum scsi_device_state new_state); /* accessor functions for the SCSI parameters */ static inline int scsi_device_sync(struct scsi_device *sdev) { return sdev->sdtr; } static inline int scsi_device_wide(struct scsi_device *sdev) { return sdev->wdtr; } static inline int scsi_device_dt(struct scsi_device *sdev) { return sdev->ppr; } static inline int scsi_device_dt_only(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return (sdev->inquiry[56] & 0x0c) == 0x04; } static inline int scsi_device_ius(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return sdev->inquiry[56] & 0x01; } static inline int scsi_device_qas(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return sdev->inquiry[56] & 0x02; } static inline int scsi_device_enclosure(struct scsi_device *sdev) { return sdev->inquiry ? (sdev->inquiry[6] & (1<<6)) : 1; } static inline int scsi_device_protection(struct scsi_device *sdev) { if (sdev->no_dif) return 0; return sdev->scsi_level > SCSI_2 && sdev->inquiry[5] & (1<<0); } static inline int scsi_device_tpgs(struct scsi_device *sdev) { return sdev->inquiry ? (sdev->inquiry[5] >> 4) & 0x3 : 0; } /** * scsi_device_supports_vpd - test if a device supports VPD pages * @sdev: the &struct scsi_device to test * * If the 'try_vpd_pages' flag is set it takes precedence. * Otherwise we will assume VPD pages are supported if the * SCSI level is at least SPC-3 and 'skip_vpd_pages' is not set. */ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) { /* Attempt VPD inquiry if the device blacklist explicitly calls * for it. */ if (sdev->try_vpd_pages) return 1; /* * Although VPD inquiries can go to SCSI-2 type devices, * some USB ones crash on receiving them, and the pages * we currently ask for are mandatory for SPC-2 and beyond */ if (sdev->scsi_level >= SCSI_SPC_2 && !sdev->skip_vpd_pages) return 1; return 0; } static inline int scsi_device_busy(struct scsi_device *sdev) { return sbitmap_weight(&sdev->budget_map); } /* Macros to access the UNIT ATTENTION counters */ #define scsi_get_ua_new_media_ctr(sdev) atomic_read(&sdev->ua_new_media_ctr) #define scsi_get_ua_por_ctr(sdev) atomic_read(&sdev->ua_por_ctr) #define MODULE_ALIAS_SCSI_DEVICE(type) \ MODULE_ALIAS("scsi:t-" __stringify(type) "*") #define SCSI_DEVICE_MODALIAS_FMT "scsi:t-0x%02x" #endif /* _SCSI_SCSI_DEVICE_H */ |
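/*
 * Illustrative sketch, not part of scsi_device.h above: one way a driver
 * might use shost_for_each_device() while honouring its documented rule that
 * breaking out of the loop keeps a device reference, which must later be
 * dropped with scsi_device_put(). The find_first_online_sdev() name and the
 * shost argument are assumptions made only for this example.
 */
static struct scsi_device *find_first_online_sdev(struct Scsi_Host *shost)
{
	struct scsi_device *sdev;

	shost_for_each_device(sdev, shost) {
		if (scsi_device_online(sdev)) {
			/*
			 * Keep the reference taken by the iterator; the
			 * caller must drop it with scsi_device_put().
			 */
			return sdev;
		}
	}

	return NULL;
}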
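/*
 * Illustrative sketch, assuming the scsi_execute_cmd()/scsi_failures API
 * declared above: issue TEST UNIT READY and let the midlayer retry a
 * NOT READY sense a bounded number of times. example_tur(), the timeout and
 * the retry counts are invented for this example; TEST_UNIT_READY,
 * NOT_READY, SAM_STAT_CHECK_CONDITION and REQ_OP_DRV_IN come from the usual
 * SCSI/block headers.
 */
static int example_tur(struct scsi_device *sdev)
{
	unsigned char cmd[6] = { TEST_UNIT_READY };
	struct scsi_failure failure_defs[] = {
		{
			/* Retry NOT READY (any ASC/ASCQ) up to 5 times. */
			.sense = NOT_READY,
			.asc = SCMD_FAILURE_ASC_ANY,
			.ascq = SCMD_FAILURE_ASCQ_ANY,
			.allowed = 5,
			.result = SAM_STAT_CHECK_CONDITION,
		},
		{}
	};
	struct scsi_failures failures = {
		.failure_definitions = failure_defs,
	};
	const struct scsi_exec_args exec_args = {
		.failures = &failures,
	};

	return scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, NULL, 0, 10 * HZ, 3,
				&exec_args);
}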
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
*/ #include <linux/slab.h> #include <linux/compat.h> #include <linux/bio.h> #include <linux/buffer_head.h> #include "exfat_raw.h" #include "exfat_fs.h" static int exfat_extract_uni_name(struct exfat_dentry *ep, unsigned short *uniname) { int i, len = 0; for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { *uniname = le16_to_cpu(ep->dentry.name.unicode_0_14[i]); if (*uniname == 0x0) return len; uniname++; len++; } *uniname = 0x0; return len; } static int exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { int i, err; struct exfat_entry_set_cache es; unsigned int uni_len = 0, len; err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES); if (err) return err; /* * First entry : file entry * Second entry : stream-extension entry * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. */ for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) { struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ if (exfat_get_entry_type(ep) != TYPE_EXTEND) break; len = exfat_extract_uni_name(ep, uniname); uni_len += len; if (len != EXFAT_FILE_NAME_LEN || uni_len >= MAX_NAME_LENGTH) break; uniname += EXFAT_FILE_NAME_LEN; } exfat_put_dentry_set(&es, false); return 0; } /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { int i, dentries_per_clu, num_ext, err; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; struct exfat_dentry *ep; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int dentry = EXFAT_B_TO_DEN(*cpos) & 0xFFFFFFFF; struct buffer_head *bh; /* check if the given file ID is opened */ if (ei->type != TYPE_DIR) return -EPERM; exfat_chain_set(&dir, ei->start_clu, EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); dentries_per_clu = sbi->dentries_per_clu; max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi)); clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi); exfat_chain_dup(&clu, &dir); if (clu.flags == ALLOC_NO_FAT_CHAIN) { clu.dir += clu_offset; clu.size -= clu_offset; } else { /* hint_information */ if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { clu_offset -= ei->hint_bmap.off; clu.dir = ei->hint_bmap.clu; } while (clu_offset > 0 && clu.dir != EXFAT_EOF_CLUSTER) { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; clu_offset--; } } while (clu.dir != EXFAT_EOF_CLUSTER && dentry < max_dentries) { i = dentry & (dentries_per_clu - 1); for ( ; i < dentries_per_clu; i++, dentry++) { ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { brelse(bh); continue; } num_ext = ep->dentry.file.num_ext; dir_entry->attr = le16_to_cpu(ep->dentry.file.attr); *uni_name.name = 0x0; err = exfat_get_uniname_from_ext_entry(sb, &clu, i, uni_name.name); if (err) { brelse(bh); continue; } exfat_utf16_to_nls(sb, &uni_name, dir_entry->namebuf.lfn, dir_entry->namebuf.lfnbuf_len); brelse(bh); ep = exfat_get_dentry(sb, &clu, i + 1, &bh); if (!ep) return -EIO; dir_entry->entry = i; dir_entry->dir = clu; brelse(bh); ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi); ei->hint_bmap.clu = 
clu.dir; *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext); return 0; } if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; } } out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; } static void exfat_init_namebuf(struct exfat_dentry_namebuf *nb) { nb->lfn = NULL; nb->lfnbuf_len = 0; } static int exfat_alloc_namebuf(struct exfat_dentry_namebuf *nb) { nb->lfn = __getname(); if (!nb->lfn) return -ENOMEM; nb->lfnbuf_len = MAX_VFSNAME_BUF_SIZE; return 0; } static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) { if (!nb->lfn) return; __putname(nb->lfn); exfat_init_namebuf(nb); } /* * Before calling dir_emit*(), sbi->s_lock should be released * because page fault can occur in dir_emit*(). */ #define ITER_POS_FILLED_DOTS (2) static int exfat_iterate(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct inode *tmp; struct exfat_dir_entry de; struct exfat_dentry_namebuf *nb = &(de.namebuf); struct exfat_inode_info *ei = EXFAT_I(inode); unsigned long inum; loff_t cpos, i_pos; int err = 0, fake_offset = 0; exfat_init_namebuf(nb); cpos = ctx->pos; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos == ITER_POS_FILLED_DOTS) { cpos = 0; fake_offset = 1; } cpos = round_up(cpos, DENTRY_SIZE); /* name buffer should be allocated before use */ err = exfat_alloc_namebuf(nb); if (err) goto out; get_new: mutex_lock(&EXFAT_SB(sb)->s_lock); if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode)) goto end_of_dir; err = exfat_readdir(inode, &cpos, &de); if (err) { /* * At least we tried to read a sector. * Move cpos to next sector position (should be aligned). */ if (err == -EIO) { cpos += 1 << (sb->s_blocksize_bits); cpos &= ~(sb->s_blocksize - 1); } err = -EIO; goto end_of_dir; } if (!nb->lfn[0]) goto end_of_dir; i_pos = ((loff_t)de.dir.dir << 32) | (de.entry & 0xffffffff); tmp = exfat_iget(sb, i_pos); if (tmp) { inum = tmp->i_ino; iput(tmp); } else { inum = iunique(sb, EXFAT_ROOT_INO); } mutex_unlock(&EXFAT_SB(sb)->s_lock); if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum, (de.attr & EXFAT_ATTR_SUBDIR) ? DT_DIR : DT_REG)) goto out; ctx->pos = cpos; goto get_new; end_of_dir: if (!cpos && fake_offset) cpos = ITER_POS_FILLED_DOTS; ctx->pos = cpos; mutex_unlock(&EXFAT_SB(sb)->s_lock); out: /* * To improve performance, free namebuf after unlock sb_lock. * If namebuf is not allocated, this function do nothing */ exfat_free_namebuf(nb); return err; } WRAP_DIR_ITER(exfat_iterate) // FIXME! 
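/*
 * Illustrative sketch, with names invented for this example: exfat_iterate()
 * above packs the directory cluster (de.dir.dir) and the dentry index
 * (de.entry) into a single 64-bit i_pos before calling exfat_iget().
 * Splitting such a value back into its two halves is the mirror operation.
 */
static inline void example_unpack_i_pos(loff_t i_pos, unsigned int *dir_clu,
					unsigned int *entry)
{
	*dir_clu = (unsigned int)(i_pos >> 32);		/* de.dir.dir */
	*entry = (unsigned int)(i_pos & 0xffffffff);	/* de.entry */
}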
const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = shared_exfat_iterate, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, #endif .fsync = exfat_file_fsync, }; int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu) { int ret; exfat_chain_set(clu, EXFAT_EOF_CLUSTER, 0, ALLOC_NO_FAT_CHAIN); ret = exfat_alloc_cluster(inode, 1, clu, IS_DIRSYNC(inode)); if (ret) return ret; return exfat_zeroed_cluster(inode, clu->dir); } int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) { int len; len = p_uniname->name_len; if (len == 0) return -EINVAL; /* 1 file entry + 1 stream entry + name entries */ return ES_ENTRY_NUM(len); } unsigned int exfat_get_entry_type(struct exfat_dentry *ep) { if (ep->type == EXFAT_UNUSED) return TYPE_UNUSED; if (IS_EXFAT_DELETED(ep->type)) return TYPE_DELETED; if (ep->type == EXFAT_INVAL) return TYPE_INVALID; if (IS_EXFAT_CRITICAL_PRI(ep->type)) { if (ep->type == EXFAT_BITMAP) return TYPE_BITMAP; if (ep->type == EXFAT_UPCASE) return TYPE_UPCASE; if (ep->type == EXFAT_VOLUME) return TYPE_VOLUME; if (ep->type == EXFAT_FILE) { if (le16_to_cpu(ep->dentry.file.attr) & EXFAT_ATTR_SUBDIR) return TYPE_DIR; return TYPE_FILE; } return TYPE_CRITICAL_PRI; } if (IS_EXFAT_BENIGN_PRI(ep->type)) { if (ep->type == EXFAT_GUID) return TYPE_GUID; if (ep->type == EXFAT_PADDING) return TYPE_PADDING; if (ep->type == EXFAT_ACLTAB) return TYPE_ACLTAB; return TYPE_BENIGN_PRI; } if (IS_EXFAT_CRITICAL_SEC(ep->type)) { if (ep->type == EXFAT_STREAM) return TYPE_STREAM; if (ep->type == EXFAT_NAME) return TYPE_EXTEND; if (ep->type == EXFAT_ACL) return TYPE_ACL; return TYPE_CRITICAL_SEC; } if (ep->type == EXFAT_VENDOR_EXT) return TYPE_VENDOR_EXT; if (ep->type == EXFAT_VENDOR_ALLOC) return TYPE_VENDOR_ALLOC; return TYPE_BENIGN_SEC; } static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type) { if (type == TYPE_UNUSED) { ep->type = EXFAT_UNUSED; } else if (type == TYPE_DELETED) { ep->type &= EXFAT_DELETE; } else if (type == TYPE_STREAM) { ep->type = EXFAT_STREAM; } else if (type == TYPE_EXTEND) { ep->type = EXFAT_NAME; } else if (type == TYPE_BITMAP) { ep->type = EXFAT_BITMAP; } else if (type == TYPE_UPCASE) { ep->type = EXFAT_UPCASE; } else if (type == TYPE_VOLUME) { ep->type = EXFAT_VOLUME; } else if (type == TYPE_DIR) { ep->type = EXFAT_FILE; ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_SUBDIR); } else if (type == TYPE_FILE) { ep->type = EXFAT_FILE; ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_ARCHIVE); } } static void exfat_init_stream_entry(struct exfat_dentry *ep, unsigned int start_clu, unsigned long long size) { memset(ep, 0, sizeof(*ep)); exfat_set_entry_type(ep, TYPE_STREAM); if (size == 0) ep->dentry.stream.flags = ALLOC_FAT_CHAIN; else ep->dentry.stream.flags = ALLOC_NO_FAT_CHAIN; ep->dentry.stream.start_clu = cpu_to_le32(start_clu); ep->dentry.stream.valid_size = cpu_to_le64(size); ep->dentry.stream.size = cpu_to_le64(size); } static void exfat_init_name_entry(struct exfat_dentry *ep, unsigned short *uniname) { int i; exfat_set_entry_type(ep, TYPE_EXTEND); ep->dentry.name.flags = 0x0; for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { if (*uniname != 0x0) { ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname); uniname++; } else { ep->dentry.name.unicode_0_14[i] = 0x0; } } } void exfat_init_dir_entry(struct exfat_entry_set_cache *es, unsigned int type, unsigned int start_clu, unsigned long long size, struct timespec64 *ts) { struct super_block *sb 
= es->sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); memset(ep, 0, sizeof(*ep)); exfat_set_entry_type(ep, type); exfat_set_entry_time(sbi, ts, &ep->dentry.file.create_tz, &ep->dentry.file.create_time, &ep->dentry.file.create_date, &ep->dentry.file.create_time_cs); exfat_set_entry_time(sbi, ts, &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, &ep->dentry.file.modify_time_cs); exfat_set_entry_time(sbi, ts, &ep->dentry.file.access_tz, &ep->dentry.file.access_time, &ep->dentry.file.access_date, NULL); ep = exfat_get_dentry_cached(es, ES_IDX_STREAM); exfat_init_stream_entry(ep, start_clu, size); } static void exfat_free_benign_secondary_clusters(struct inode *inode, struct exfat_dentry *ep) { struct super_block *sb = inode->i_sb; struct exfat_chain dir; unsigned int start_clu = le32_to_cpu(ep->dentry.generic_secondary.start_clu); u64 size = le64_to_cpu(ep->dentry.generic_secondary.size); unsigned char flags = ep->dentry.generic_secondary.flags; if (!(flags & ALLOC_POSSIBLE) || !start_clu || !size) return; exfat_chain_set(&dir, start_clu, EXFAT_B_TO_CLU_ROUND_UP(size, EXFAT_SB(sb)), flags); exfat_free_cluster(inode, &dir); } void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries, struct exfat_uni_name *p_uniname) { int i; unsigned short *uniname = p_uniname->name; struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.num_ext = (unsigned char)(num_entries - 1); ep = exfat_get_dentry_cached(es, ES_IDX_STREAM); ep->dentry.stream.name_len = p_uniname->name_len; ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash); for (i = ES_IDX_FIRST_FILENAME; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); exfat_init_name_entry(ep, uniname); uniname += EXFAT_FILE_NAME_LEN; } exfat_update_dir_chksum(es); } void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es, int order) { int i; struct exfat_dentry *ep; for (i = order; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC) exfat_free_benign_secondary_clusters(inode, ep); exfat_set_entry_type(ep, TYPE_DELETED); } if (order < es->num_entries) es->modified = true; } void exfat_update_dir_chksum(struct exfat_entry_set_cache *es) { int chksum_type = CS_DIR_ENTRY, i; unsigned short chksum = 0; struct exfat_dentry *ep; for (i = ES_IDX_FILE; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, chksum_type); chksum_type = CS_DEFAULT; } ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.checksum = cpu_to_le16(chksum); es->modified = true; } int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync) { int i, err = 0; if (es->modified) err = exfat_update_bhs(es->bh, es->num_bh, sync); for (i = 0; i < es->num_bh; i++) if (err) bforget(es->bh[i]); else brelse(es->bh[i]); if (IS_DYNAMIC_ES(es)) kfree(es->bh); return err; } static int exfat_walk_fat_chain(struct super_block *sb, struct exfat_chain *p_dir, unsigned int byte_offset, unsigned int *clu) { struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned int clu_offset; unsigned int cur_clu; clu_offset = EXFAT_B_TO_CLU(byte_offset, sbi); cur_clu = p_dir->dir; if (p_dir->flags == ALLOC_NO_FAT_CHAIN) { cur_clu += clu_offset; } else { while (clu_offset > 0) { if (exfat_get_next_cluster(sb, &cur_clu)) return -EIO; if (cur_clu == EXFAT_EOF_CLUSTER) { exfat_fs_error(sb, "invalid 
dentry access beyond EOF (clu : %u, eidx : %d)", p_dir->dir, EXFAT_B_TO_DEN(byte_offset)); return -EIO; } clu_offset--; } } *clu = cur_clu; return 0; } static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, int entry, sector_t *sector, int *offset) { int ret; unsigned int off, clu = 0; struct exfat_sb_info *sbi = EXFAT_SB(sb); off = EXFAT_DEN_TO_B(entry); ret = exfat_walk_fat_chain(sb, p_dir, off, &clu); if (ret) return ret; if (!exfat_test_bitmap(sb, clu)) { exfat_err(sb, "failed to test cluster bit(%u)", clu); return -EIO; } /* byte offset in cluster */ off = EXFAT_CLU_OFFSET(off, sbi); /* byte offset in sector */ *offset = EXFAT_BLK_OFFSET(off, sb); /* sector offset in cluster */ *sector = EXFAT_B_TO_BLK(off, sb); *sector += exfat_cluster_to_sector(sbi, clu); return 0; } #define EXFAT_MAX_RA_SIZE (128*1024) static int exfat_dir_readahead(struct super_block *sb, sector_t sec) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; unsigned int max_ra_count = EXFAT_MAX_RA_SIZE >> sb->s_blocksize_bits; unsigned int page_ra_count = PAGE_SIZE >> sb->s_blocksize_bits; unsigned int adj_ra_count = max(sbi->sect_per_clus, page_ra_count); unsigned int ra_count = min(adj_ra_count, max_ra_count); /* Read-ahead is not required */ if (sbi->sect_per_clus == 1) return 0; if (sec < sbi->data_start_sector) { exfat_err(sb, "requested sector is invalid(sect:%llu, root:%llu)", (unsigned long long)sec, sbi->data_start_sector); return -EIO; } /* Not sector aligned with ra_count, resize ra_count to page size */ if ((sec - sbi->data_start_sector) & (ra_count - 1)) ra_count = page_ra_count; bh = sb_find_get_block(sb, sec); if (!bh || !buffer_uptodate(bh)) { unsigned int i; for (i = 0; i < ra_count; i++) sb_breadahead(sb, (sector_t)(sec + i)); } brelse(bh); return 0; } struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh) { unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE); int off; sector_t sec; if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "abnormal access to deleted dentry"); return NULL; } if (exfat_find_location(sb, p_dir, entry, &sec, &off)) return NULL; if (p_dir->dir != EXFAT_FREE_CLUSTER && !(entry & (dentries_per_page - 1))) exfat_dir_readahead(sb, sec); *bh = sb_bread(sb, sec); if (!*bh) return NULL; return (struct exfat_dentry *)((*bh)->b_data + off); } enum exfat_validate_dentry_mode { ES_MODE_GET_FILE_ENTRY, ES_MODE_GET_STRM_ENTRY, ES_MODE_GET_NAME_ENTRY, ES_MODE_GET_CRITICAL_SEC_ENTRY, ES_MODE_GET_BENIGN_SEC_ENTRY, }; static bool exfat_validate_entry(unsigned int type, enum exfat_validate_dentry_mode *mode) { if (type == TYPE_UNUSED || type == TYPE_DELETED) return false; switch (*mode) { case ES_MODE_GET_FILE_ENTRY: if (type != TYPE_STREAM) return false; *mode = ES_MODE_GET_STRM_ENTRY; break; case ES_MODE_GET_STRM_ENTRY: if (type != TYPE_EXTEND) return false; *mode = ES_MODE_GET_NAME_ENTRY; break; case ES_MODE_GET_NAME_ENTRY: if (type & TYPE_BENIGN_SEC) *mode = ES_MODE_GET_BENIGN_SEC_ENTRY; else if (type != TYPE_EXTEND) return false; break; case ES_MODE_GET_BENIGN_SEC_ENTRY: /* Assume unreconized benign secondary entry */ if (!(type & TYPE_BENIGN_SEC)) return false; break; default: return false; } return true; } struct exfat_dentry *exfat_get_dentry_cached( struct exfat_entry_set_cache *es, int num) { int off = es->start_off + num * DENTRY_SIZE; struct buffer_head *bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)]; char *p = bh->b_data + EXFAT_BLK_OFFSET(off, es->sb); return (struct 
exfat_dentry *)p; } /* * Returns a set of dentries. * * Note It provides a direct pointer to bh->data via exfat_get_dentry_cached(). * User should call exfat_get_dentry_set() after setting 'modified' to apply * changes made in this entry set to the real device. * * in: * sb+p_dir+entry: indicates a file/dir * num_entries: specifies how many dentries should be included. * It will be set to es->num_entries if it is not 0. * If num_entries is 0, es->num_entries will be obtained * from the first dentry. * out: * es: pointer of entry set on success. * return: * 0 on success * -error code on failure */ static int __exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries) { int ret, i, num_bh; unsigned int off; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "access to deleted dentry"); return -EIO; } ret = exfat_find_location(sb, p_dir, entry, &sec, &off); if (ret) return ret; memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; es->start_off = off; es->bh = es->__bh; bh = sb_bread(sb, sec); if (!bh) return -EIO; es->bh[es->num_bh++] = bh; if (num_entries == ES_ALL_ENTRIES) { struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); if (ep->type != EXFAT_FILE) { brelse(bh); return -EIO; } num_entries = ep->dentry.file.num_ext + 1; } es->num_entries = num_entries; num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb); if (num_bh > ARRAY_SIZE(es->__bh)) { es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_NOFS); if (!es->bh) { brelse(bh); return -ENOMEM; } es->bh[0] = bh; } for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { unsigned int clu = exfat_sector_to_cluster(sbi, sec); if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) goto put_es; sec = exfat_cluster_to_sector(sbi, clu); } else { sec++; } bh = sb_bread(sb, sec); if (!bh) goto put_es; es->bh[es->num_bh++] = bh; } return 0; put_es: exfat_put_dentry_set(es, false); return -EIO; } int exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries) { int ret, i; struct exfat_dentry *ep; enum exfat_validate_dentry_mode mode = ES_MODE_GET_FILE_ENTRY; ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries); if (ret < 0) return ret; /* validate cached dentries */ for (i = ES_IDX_STREAM; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto put_es; } return 0; put_es: exfat_put_dentry_set(es, false); return -EIO; } static int exfat_validate_empty_dentry_set(struct exfat_entry_set_cache *es) { struct exfat_dentry *ep; struct buffer_head *bh; int i, off; bool unused_hit = false; /* * ONLY UNUSED OR DELETED DENTRIES ARE ALLOWED: * Although it violates the specification for a deleted entry to * follow an unused entry, some exFAT implementations could work * like this. Therefore, to improve compatibility, let's allow it. 
*/ for (i = 0; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (ep->type == EXFAT_UNUSED) { unused_hit = true; } else if (!IS_EXFAT_DELETED(ep->type)) { if (unused_hit) goto err_used_follow_unused; i++; goto count_skip_entries; } } return 0; err_used_follow_unused: off = es->start_off + (i << DENTRY_SIZE_BITS); bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)]; exfat_fs_error(es->sb, "in sector %lld, dentry %d should be unused, but 0x%x", bh->b_blocknr, off >> DENTRY_SIZE_BITS, ep->type); return -EIO; count_skip_entries: es->num_entries = EXFAT_B_TO_DEN(EXFAT_BLK_TO_B(es->num_bh, es->sb) - es->start_off); for (; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (IS_EXFAT_DELETED(ep->type)) break; } return i; } /* * Get an empty dentry set. * * in: * sb+p_dir+entry: indicates the empty dentry location * num_entries: specifies how many empty dentries should be included. * out: * es: pointer of empty dentry set on success. * return: * 0 : on success * >0 : the dentries are not empty, the return value is the number of * dentries to be skipped for the next lookup. * <0 : on failure */ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries) { int ret; ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries); if (ret < 0) return ret; ret = exfat_validate_empty_dentry_set(es); if (ret) exfat_put_dentry_set(es, false); return ret; } static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) { hint_femp->eidx = EXFAT_HINT_NONE; hint_femp->count = 0; } static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, int dentry, int num_entries, int entry_type) { if (ei->hint_femp.eidx == EXFAT_HINT_NONE || ei->hint_femp.eidx > dentry) { int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode)); if (candi_empty->count == 0) { candi_empty->cur = *clu; candi_empty->eidx = dentry; } if (entry_type == TYPE_UNUSED) candi_empty->count += total_entries - dentry; else candi_empty->count++; if (candi_empty->count == num_entries || candi_empty->count + candi_empty->eidx == total_entries) ei->hint_femp = *candi_empty; } } enum { DIRENT_STEP_FILE, DIRENT_STEP_STRM, DIRENT_STEP_NAME, DIRENT_STEP_SECD, }; /* * @ei: inode info of parent directory * @p_dir: directory structure of parent directory * @num_entries:entry size of p_uniname * @hint_opt: If p_uniname is found, filled with optimized dir/entry * for traversing cluster chain. 
* @return: * >= 0: file directory entry position where the name exists * -ENOENT: entry with the name does not exist * -EIO: I/O error */ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, struct exfat_hint *hint_opt) { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; int dentries_per_clu; unsigned int entry_type; unsigned short *uniname = NULL; struct exfat_chain clu; struct exfat_hint *hint_stat = &ei->hint_stat; struct exfat_hint_femp candi_empty; struct exfat_sb_info *sbi = EXFAT_SB(sb); int num_entries = exfat_calc_num_entries(p_uniname); unsigned int clu_count = 0; if (num_entries < 0) return num_entries; dentries_per_clu = sbi->dentries_per_clu; exfat_chain_dup(&clu, p_dir); if (hint_stat->eidx) { clu.dir = hint_stat->clu; dentry = hint_stat->eidx; end_eidx = dentry; } exfat_reset_empty_hint(&ei->hint_femp); rewind: order = 0; step = DIRENT_STEP_FILE; exfat_reset_empty_hint(&candi_empty); while (clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 1); for (; i < dentries_per_clu; i++, dentry++) { struct exfat_dentry *ep; struct buffer_head *bh; if (rewind && dentry == end_eidx) goto not_found; ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; entry_type = exfat_get_entry_type(ep); if (entry_type == TYPE_UNUSED || entry_type == TYPE_DELETED) { step = DIRENT_STEP_FILE; exfat_set_empty_hint(ei, &candi_empty, &clu, dentry, num_entries, entry_type); brelse(bh); if (entry_type == TYPE_UNUSED) goto not_found; continue; } exfat_reset_empty_hint(&candi_empty); if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { step = DIRENT_STEP_FILE; hint_opt->clu = clu.dir; hint_opt->eidx = i; num_ext = ep->dentry.file.num_ext; step = DIRENT_STEP_STRM; brelse(bh); continue; } if (entry_type == TYPE_STREAM) { u16 name_hash; if (step != DIRENT_STEP_STRM) { step = DIRENT_STEP_FILE; brelse(bh); continue; } step = DIRENT_STEP_FILE; name_hash = le16_to_cpu( ep->dentry.stream.name_hash); if (p_uniname->name_hash == name_hash && p_uniname->name_len == ep->dentry.stream.name_len) { step = DIRENT_STEP_NAME; order = 1; name_len = 0; } brelse(bh); continue; } brelse(bh); if (entry_type == TYPE_EXTEND) { unsigned short entry_uniname[16], unichar; if (step != DIRENT_STEP_NAME || name_len >= MAX_NAME_LENGTH) { step = DIRENT_STEP_FILE; continue; } if (++order == 2) uniname = p_uniname->name; else uniname += EXFAT_FILE_NAME_LEN; len = exfat_extract_uni_name(ep, entry_uniname); name_len += len; unichar = *(uniname+len); *(uniname+len) = 0x0; if (exfat_uniname_ncmp(sb, uniname, entry_uniname, len)) { step = DIRENT_STEP_FILE; } else if (p_uniname->name_len == name_len) { if (order == num_ext) goto found; step = DIRENT_STEP_SECD; } *(uniname+len) = unichar; continue; } if (entry_type & (TYPE_CRITICAL_SEC | TYPE_BENIGN_SEC)) { if (step == DIRENT_STEP_SECD) { if (++order == num_ext) goto found; continue; } } step = DIRENT_STEP_FILE; } if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; /* break if the cluster chain includes a loop */ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi))) goto not_found; } } not_found: /* * We started at not 0 index,so we should try to find target * from 0 index to the index we started at. 
*/ if (!rewind && end_eidx) { rewind = 1; dentry = 0; clu.dir = p_dir->dir; goto rewind; } /* * set the EXFAT_EOF_CLUSTER flag to avoid search * from the beginning again when allocated a new cluster */ if (ei->hint_femp.eidx == EXFAT_HINT_NONE) { ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER; ei->hint_femp.eidx = p_dir->size * dentries_per_clu; ei->hint_femp.count = 0; } /* initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; return -ENOENT; found: /* next dentry we'll find is out of this cluster */ if (!((dentry + 1) & (dentries_per_clu - 1))) { int ret = 0; if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { ret = exfat_get_next_cluster(sb, &clu.dir); } if (ret || clu.dir == EXFAT_EOF_CLUSTER) { /* just initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; return (dentry - num_ext); } } hint_stat->clu = clu.dir; hint_stat->eidx = dentry + 1; return dentry - num_ext; } int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) { int i, count = 0; int dentries_per_clu; unsigned int entry_type; unsigned int clu_count = 0; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; dentries_per_clu = sbi->dentries_per_clu; exfat_chain_dup(&clu, p_dir); while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < dentries_per_clu; i++) { ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; entry_type = exfat_get_entry_type(ep); brelse(bh); if (entry_type == TYPE_UNUSED) return count; if (entry_type != TYPE_DIR) continue; count++; } if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; if (unlikely(++clu_count > sbi->used_clusters)) { exfat_fs_error(sb, "FAT or bitmap is corrupted"); return -EIO; } } } return count; } static int exfat_get_volume_label_dentry(struct super_block *sb, struct exfat_entry_set_cache *es) { int i; int dentry = 0; unsigned int type; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_hint_femp hint_femp; struct exfat_inode_info *ei = EXFAT_I(sb->s_root->d_inode); struct exfat_chain clu; struct exfat_dentry *ep; struct buffer_head *bh; hint_femp.eidx = EXFAT_HINT_NONE; exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < sbi->dentries_per_clu; i++, dentry++) { ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); if (hint_femp.eidx == EXFAT_HINT_NONE) { if (type == TYPE_DELETED || type == TYPE_UNUSED) { hint_femp.cur = clu; hint_femp.eidx = dentry; hint_femp.count = 1; } } if (type == TYPE_UNUSED) { brelse(bh); goto not_found; } if (type != TYPE_VOLUME) { brelse(bh); continue; } memset(es, 0, sizeof(*es)); es->sb = sb; es->bh = es->__bh; es->bh[0] = bh; es->num_bh = 1; es->start_off = EXFAT_DEN_TO_B(i) % sb->s_blocksize; return 0; } if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; } not_found: if (hint_femp.eidx == EXFAT_HINT_NONE) { hint_femp.cur.dir = EXFAT_EOF_CLUSTER; hint_femp.eidx = dentry; hint_femp.count = 0; } ei->hint_femp = hint_femp; return -ENOENT; } int exfat_read_volume_label(struct super_block *sb, struct exfat_uni_name *label_out) { int ret, i; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_entry_set_cache es; struct exfat_dentry *ep; mutex_lock(&sbi->s_lock); memset(label_out, 0, sizeof(*label_out)); ret = exfat_get_volume_label_dentry(sb, 
&es); if (ret < 0) { /* * ENOENT signifies that a volume label dentry doesn't exist * We will treat this as an empty volume label and not fail. */ if (ret == -ENOENT) ret = 0; goto unlock; } ep = exfat_get_dentry_cached(&es, 0); label_out->name_len = ep->dentry.volume_label.char_count; if (label_out->name_len > EXFAT_VOLUME_LABEL_LEN) { ret = -EIO; exfat_put_dentry_set(&es, false); goto unlock; } for (i = 0; i < label_out->name_len; i++) label_out->name[i] = le16_to_cpu(ep->dentry.volume_label.volume_label[i]); exfat_put_dentry_set(&es, false); unlock: mutex_unlock(&sbi->s_lock); return ret; } int exfat_write_volume_label(struct super_block *sb, struct exfat_uni_name *label) { int ret, i; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct inode *root_inode = sb->s_root->d_inode; struct exfat_entry_set_cache es; struct exfat_chain clu; struct exfat_dentry *ep; if (label->name_len > EXFAT_VOLUME_LABEL_LEN) return -EINVAL; mutex_lock(&sbi->s_lock); ret = exfat_get_volume_label_dentry(sb, &es); if (ret == -ENOENT) { if (label->name_len == 0) { /* No volume label dentry, no need to clear */ ret = 0; goto unlock; } ret = exfat_find_empty_entry(root_inode, &clu, 1, &es); } if (ret < 0) goto unlock; ep = exfat_get_dentry_cached(&es, 0); if (label->name_len == 0 && ep->dentry.volume_label.char_count == 0) { /* volume label had been cleared */ exfat_put_dentry_set(&es, 0); goto unlock; } memset(ep, 0, sizeof(*ep)); ep->type = EXFAT_VOLUME; for (i = 0; i < label->name_len; i++) ep->dentry.volume_label.volume_label[i] = cpu_to_le16(label->name[i]); ep->dentry.volume_label.char_count = label->name_len; es.modified = true; ret = exfat_put_dentry_set(&es, IS_DIRSYNC(root_inode)); unlock: mutex_unlock(&sbi->s_lock); return ret; } |
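/*
 * Illustrative sketch, not part of dir.c above: the usual pattern for
 * updating a cached dentry set with the helpers defined in this file - get
 * the set, edit the cached dentries, mark the set modified (here via
 * exfat_update_dir_chksum()), then write it back with the parent inode's
 * sync policy. example_touch_attr() and its attr argument are invented for
 * this example.
 */
static int example_touch_attr(struct inode *dir, struct exfat_chain *p_dir,
			      int entry, u16 attr)
{
	struct exfat_entry_set_cache es;
	struct exfat_dentry *ep;
	int err;

	err = exfat_get_dentry_set(&es, dir->i_sb, p_dir, entry,
				   ES_ALL_ENTRIES);
	if (err)
		return err;

	ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
	ep->dentry.file.attr = cpu_to_le16(attr);

	/* Recomputes the set checksum and sets es.modified. */
	exfat_update_dir_chksum(&es);

	return exfat_put_dentry_set(&es, IS_DIRSYNC(dir));
}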
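/*
 * Illustrative sketch, with an invented caller: how the return convention of
 * exfat_get_empty_dentry_set() documented above could be consumed - 0 means
 * the requested dentries are empty and the set is held, a positive value is
 * the number of dentries to skip before the next lookup, and a negative
 * value is an error. example_probe_empty() and the -EEXIST mapping are
 * choices made for this example only.
 */
static int example_probe_empty(struct super_block *sb,
			       struct exfat_chain *p_dir, int entry,
			       unsigned int num_entries,
			       struct exfat_entry_set_cache *es)
{
	int ret = exfat_get_empty_dentry_set(es, sb, p_dir, entry,
					     num_entries);

	if (ret < 0)		/* I/O or validation error */
		return ret;
	if (ret > 0)		/* occupied: advance by 'ret' dentries and retry */
		return -EEXIST;

	/* ret == 0: 'es' now covers 'num_entries' reusable dentries. */
	return 0;
}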
// SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/util.c * Copyright (C) 1992 Krishna Balasubramanian * * Sep 1997 - Call suser() last after "normal" permission checks so we * get BSD style process accounting right. * Occurs in several places in the IPC code. * Chris Evans, <chris@ferret.lmh.ox.ac.uk> * Nov 1999 - ipc helper functions, unified SMP locking * Manfred Spraul <manfred@colorfullife.com> * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary(). * Mingming Cao <cmm@us.ibm.com> * Mar 2006 - support for audit of ipc object properties * Dustin Kirkland <dustin.kirkland@us.ibm.com> * Jun 2006 - namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * General sysv ipc locking scheme: * rcu_read_lock() * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr * tree. * - perform initial checks (capabilities, auditing and permission, * etc). * - perform read-only operations, such as INFO command, that * do not demand atomicity * acquire the ipc lock (kern_ipc_perm.lock) through * ipc_lock_object() * - perform read-only operations that demand atomicity, * such as STAT command. * - perform data updates, such as SET, RMID commands and * mechanism-specific operations (semop/semtimedop, * msgsnd/msgrcv, shmat/shmdt). * drop the ipc lock, through ipc_unlock_object(). * rcu_read_unlock() * * The ids->rwsem must be taken when: * - creating, removing and iterating the existing entries in ipc * identifier sets. * - iterating through files under /proc/sysvipc/ * * Note that sems have a special fast path that avoids kern_ipc_perm.lock - * see sem_lock(). */ #include <linux/mm.h> #include <linux/shm.h> #include <linux/init.h> #include <linux/msg.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/capability.h> #include <linux/highuid.h> #include <linux/security.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/audit.h> #include <linux/nsproxy.h> #include <linux/rwsem.h> #include <linux/memory.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/log2.h> #include <asm/unistd.h> #include "util.h" struct ipc_proc_iface { const char *path; const char *header; int ids; int (*show)(struct seq_file *, void *); }; /** * ipc_init - initialise ipc subsystem * * The various sysv ipc resources (semaphores, messages and shared * memory) are initialised. * * A callback routine is registered into the memory hotplug notifier * chain: since msgmni scales to lowmem this callback routine will be * called upon successful memory add / remove to recompute msgmni. */ static int __init ipc_init(void) { proc_mkdir("sysvipc", NULL); sem_init(); msg_init(); shm_init(); return 0; } device_initcall(ipc_init); static const struct rhashtable_params ipc_kht_params = { .head_offset = offsetof(struct kern_ipc_perm, khtnode), .key_offset = offsetof(struct kern_ipc_perm, key), .key_len = sizeof_field(struct kern_ipc_perm, key), .automatic_shrinking = true, }; /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited * below ipc_mni) then initialise the keys hashtable and ids idr.
*/ void ipc_init_ids(struct ipc_ids *ids) { ids->in_use = 0; ids->seq = 0; init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif } #ifdef CONFIG_PROC_FS static const struct proc_ops sysvipc_proc_ops; /** * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface. * @path: Path in procfs * @header: Banner to be printed at the beginning of the file. * @ids: ipc id table to iterate. * @show: show routine. */ void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)) { struct proc_dir_entry *pde; struct ipc_proc_iface *iface; iface = kmalloc(sizeof(*iface), GFP_KERNEL); if (!iface) return; iface->path = path; iface->header = header; iface->ids = ids; iface->show = show; pde = proc_create_data(path, S_IRUGO, /* world readable */ NULL, /* parent dir */ &sysvipc_proc_ops, iface); if (!pde) kfree(iface); } #endif /** * ipc_findkey - find a key in an ipc identifier set * @ids: ipc identifier set * @key: key to find * * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * * Called with writer ipc_ids.rwsem held. */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { struct kern_ipc_perm *ipcp; ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params); if (!ipcp) return NULL; rcu_read_lock(); ipc_lock_object(ipcp); return ipcp; } /* * Insert new IPC object into idr tree, and set sequence number and id * in the correct order. * Especially: * - the sequence number must be set before inserting the object into the idr, * because the sequence number is accessed without a lock. * - the id can/must be set after inserting the object into the idr. * All accesses must be done after getting kern_ipc_perm.lock. * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. * * To conserve sequence number space, especially with extended ipc_mni, * the sequence number is incremented only when the returned ID is less than * the last one. */ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { int idx, next_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE next_id = ids->next_id; ids->next_id = -1; #endif /* * As soon as a new object is inserted into the idr, * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, * and the lockless preparations for ipc operations can start. * This means especially: permission checks, audit calls, allocation * of undo structures, ... * * Thus the object must be fully initialized, and if something fails, * then the full tear-down sequence must be followed. * (i.e.: set new->deleted, reduce refcount, call_rcu()) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ int max_idx; max_idx = max(ids->in_use*3/2, ipc_min_cycle); max_idx = min(max_idx, ipc_mni); /* allocate the idx, with a NULL struct kern_ipc_perm */ idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, GFP_NOWAIT); if (idx >= 0) { /* * idx got allocated successfully. * Now calculate the sequence number and set the * pointer for real. 
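* For example, if the cyclic allocation wraps around and hands out idx 2 * after idx 7, ids->seq is bumped so that the resulting id differs from any * stale id that was built for the previous use of idx 2.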
*/ if (idx <= ids->last_idx) { ids->seq++; if (ids->seq >= ipcid_seq_max()) ids->seq = 0; } ids->last_idx = idx; new->seq = ids->seq; /* no need for smp_wmb(), this is done * inside idr_replace, as part of * rcu_assign_pointer */ idr_replace(&ids->ipcs_idr, new, idx); } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is * initialised and the first free entry is set up and the index assigned * is returned. The 'new' entry is returned in a locked state on success. * * On failure the entry is not locked and a negative err-code is returned. * The caller must use ipc_rcu_putref() to free the identifier. * * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; int idx, err; /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); if (limit > ipc_mni) limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); spin_lock_init(&new->lock); rcu_read_lock(); spin_lock(&new->lock); current_euid_egid(&euid, &egid); new->cuid = new->uid = euid; new->gid = new->cgid = egid; new->deleted = false; idx = ipc_idr_alloc(ids, new); idr_preload_end(); if (idx >= 0 && new->key != IPC_PRIVATE) { err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, ipc_kht_params); if (err < 0) { idr_remove(&ids->ipcs_idr, idx); idx = err; } } if (idx < 0) { new->deleted = true; spin_unlock(&new->lock); rcu_read_unlock(); return idx; } ids->in_use++; if (idx > ids->max_idx) ids->max_idx = idx; return idx; } /** * ipcget_new - create a new ipc object * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is IPC_PRIVATE. */ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { int err; down_write(&ids->rwsem); err = ops->getnew(ns, params); up_write(&ids->rwsem); return err; } /** * ipc_check_perms - check security and permissions for an ipc object * @ns: ipc namespace * @ipcp: ipc permission set * @ops: the actual security routine to call * @params: its parameters * * This routine is called by sys_msgget(), sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE and that key already exists in the * ds IDR. * * On success, the ipc id is returned. * * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, const struct ipc_ops *ops, struct ipc_params *params) { int err; if (ipcperms(ns, ipcp, params->flg)) err = -EACCES; else { err = ops->associate(ipcp, params->flg); if (!err) err = ipcp->id; } return err; } /** * ipcget_public - get an ipc object or create a new one * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE. * It adds a new entry if the key is not found and does some permission * / security checkings if the key is found. 
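* * For example, an existing key looked up with both IPC_CREAT and IPC_EXCL * set in @params->flg fails with -EEXIST, while a missing key without * IPC_CREAT fails with -ENOENT.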
* * On success, the ipc id is returned. */ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { struct kern_ipc_perm *ipcp; int flg = params->flg; int err; /* * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ if (!(flg & IPC_CREAT)) err = -ENOENT; else err = ops->getnew(ns, params); } else { /* ipc object has been locked by ipc_findkey() */ if (flg & IPC_CREAT && flg & IPC_EXCL) err = -EEXIST; else { err = 0; if (ops->more_checks) err = ops->more_checks(ipcp, params); if (!err) /* * ipc_check_perms returns the IPC id on * success */ err = ipc_check_perms(ns, ipcp, ops, params); } ipc_unlock(ipcp); } up_write(&ids->rwsem); return err; } /** * ipc_kht_remove - remove an ipc from the key hashtable * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, ipc_kht_params)); } /** * ipc_search_maxidx - search for the highest assigned index * @ids: ipc identifier set * @limit: known upper limit for highest assigned index * * The function determines the highest assigned index in @ids. It is intended * to be called when ids->max_idx needs to be updated. * Updating ids->max_idx is necessary when the current highest index ipc * object is deleted. * If no ipc object is allocated, then -1 is returned. * * ipc_ids.rwsem needs to be held by the caller. */ static int ipc_search_maxidx(struct ipc_ids *ids, int limit) { int tmpidx; int i; int retval; i = ilog2(limit+1); retval = 0; for (; i >= 0; i--) { tmpidx = retval | (1<<i); /* * "0" is a possible index value, thus search using * e.g. 15,7,3,1,0 instead of 16,8,4,2,1. */ tmpidx = tmpidx-1; if (idr_get_next(&ids->ipcs_idr, &tmpidx)) retval |= (1<<i); } return retval - 1; } /** * ipc_rmid - remove an ipc identifier * @ids: ipc identifier set * @ipcp: ipc perm structure containing the identifier to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { idx = ids->max_idx-1; if (idx >= 0) idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } /** * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to modify * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. 
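* Once the key has been switched to IPC_PRIVATE the object is no longer * reachable by key (its hashtable entry is removed), only by its id.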
*/ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { ipc_kht_remove(ids, ipcp); ipcp->key = IPC_PRIVATE; } bool ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); } void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)) { if (!refcount_dec_and_test(&ptr->refcount)) return; call_rcu(&ptr->rcu, func); } /** * ipcperms - check ipc permissions * @ns: ipc namespace * @ipcp: ipc permission set * @flag: desired permission set * * Check user, group, other permissions for access * to ipc resources. return 0 if allowed * * @flag will most probably be 0 or ``S_...UGO`` from <linux/stat.h> */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) { kuid_t euid = current_euid(); int requested_mode, granted_mode; audit_ipc_obj(ipcp); requested_mode = (flag >> 6) | (flag >> 3) | flag; granted_mode = ipcp->mode; if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid)) granted_mode >>= 6; else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid)) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ if ((requested_mode & ~granted_mode & 0007) && !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; return security_ipc_permission(ipcp, flag); } /* * Functions to convert between the kern_ipc_perm structure and the * old/new ipc_perm structures */ /** * kernel_to_ipc64_perm - convert kernel ipc permissions to user * @in: kernel permissions * @out: new style ipc permissions * * Turn the kernel object @in into a set of permissions descriptions * for returning to userspace (@out). */ void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out) { out->key = in->key; out->uid = from_kuid_munged(current_user_ns(), in->uid); out->gid = from_kgid_munged(current_user_ns(), in->gid); out->cuid = from_kuid_munged(current_user_ns(), in->cuid); out->cgid = from_kgid_munged(current_user_ns(), in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc64_perm_to_ipc_perm - convert new ipc permissions to old * @in: new style ipc permissions * @out: old style ipc permissions * * Turn the new style permissions object @in into a compatibility * object and store it into the @out pointer. */ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) { out->key = in->key; SET_UID(out->uid, in->uid); SET_GID(out->gid, in->gid); SET_UID(out->cuid, in->cuid); SET_GID(out->cgid, in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc_obtain_object_idr - Look for an id in the ipc ids idr and * return associated ipc object. * @ids: ipc identifier set * @id: ipc id to look for * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; int idx = ipcid_to_idx(id); out = idr_find(&ids->ipcs_idr, idx); if (!out) return ERR_PTR(-EINVAL); return out; } /** * ipc_obtain_object_check - Similar to ipc_obtain_object_idr() but * also checks the ipc object sequence number. * @ids: ipc identifier set * @id: ipc id to look for * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. 
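* The extra sequence-number check rejects stale ids: an id built for a slot * that has since been recycled fails with -EINVAL instead of silently * resolving to the new object occupying that slot.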
*/ struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out = ipc_obtain_object_idr(ids, id); if (IS_ERR(out)) goto out; if (ipc_checkid(out, id)) return ERR_PTR(-EINVAL); out: return out; } /** * ipcget - Common sys_*get() code * @ns: namespace * @ids: ipc identifier set * @ops: operations to be called on ipc object creation, permission checks * and further checks * @params: the parameters needed by the previous operations. * * Common routine called by sys_msgget(), sys_semget() and sys_shmget(). */ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { if (params->key == IPC_PRIVATE) return ipcget_new(ns, ids, ops, params); else return ipcget_public(ns, ids, ops, params); } /** * ipc_update_perm - update the permissions of an ipc object * @in: the permission given as input. * @out: the permission of the ipc to set. */ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) { kuid_t uid = make_kuid(current_user_ns(), in->uid); kgid_t gid = make_kgid(current_user_ns(), in->gid); if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; out->uid = uid; out->gid = gid; out->mode = (out->mode & ~S_IRWXUGO) | (in->mode & S_IRWXUGO); return 0; } /** * ipcctl_obtain_check - retrieve an ipc object and check permissions * @ns: ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve * @cmd: the cmd to check * @perm: the permission to set * @extra_perm: one extra permission parameter used by msq * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. * * It: * - retrieves the ipc object with the given id in the given table. * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding * error. * * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; struct kern_ipc_perm *ipcp; ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto err; } audit_ipc_obj(ipcp); if (cmd == IPC_SET) audit_ipc_set_perm(extra_perm, perm->uid, perm->gid, perm->mode); euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ipcp; /* successful lookup */ err: return ERR_PTR(err); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /** * ipc_parse_version - ipc call version * @cmd: pointer to command * * Return IPC_64 for new style IPC and IPC_OLD for old style IPC. * The @cmd value is turned from an encoding command and version into * just the command code. */ int ipc_parse_version(int *cmd) { if (*cmd & IPC_64) { *cmd ^= IPC_64; return IPC_64; } else { return IPC_OLD; } } #endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */ #ifdef CONFIG_PROC_FS struct ipc_proc_iter { struct ipc_namespace *ns; struct pid_namespace *pid_ns; struct ipc_proc_iface *iface; }; struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) { struct ipc_proc_iter *iter = s->private; return iter->pid_ns; } /** * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos * @ids: ipc identifier set * @pos: expected position * * The function finds an ipc structure, based on the sequence file * position @pos. 
If there is no ipc structure at position @pos, then * the successor is selected. * If a structure is found, then it is locked (both rcu_read_lock() and * ipc_lock_object()) and @pos is set to the position needed to locate * the found ipc structure. * If nothing is found (i.e. EOF), @pos is not modified. * * The function returns the found ipc structure, or NULL at EOF. */ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { int tmpidx; struct kern_ipc_perm *ipc; /* convert from position to idr index -> "-1" */ tmpidx = *pos - 1; ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); if (ipc != NULL) { rcu_read_lock(); ipc_lock_object(ipc); /* convert from idr index to position -> "+1" */ *pos = tmpidx + 1; } return ipc; } static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct kern_ipc_perm *ipc = it; /* If we had an ipc id locked before, unlock it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); /* Next -> search for *pos+1 */ (*pos)++; return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; ids = &iter->ns->ids[iface->ids]; /* * Take the lock - this will be released by the corresponding * call to stop(). */ down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) return NULL; /* pos == 0 means header */ if (*pos == 0) return SEQ_START_TOKEN; /* Otherwise return the correct ipc structure */ return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) { struct kern_ipc_perm *ipc = it; struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; /* If we had a locked structure, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; if (it == SEQ_START_TOKEN) { seq_puts(s, iface->header); return 0; } return iface->show(s, it); } static const struct seq_operations sysvipc_proc_seqops = { .start = sysvipc_proc_start, .stop = sysvipc_proc_stop, .next = sysvipc_proc_next, .show = sysvipc_proc_show, }; static int sysvipc_proc_open(struct inode *inode, struct file *file) { struct ipc_proc_iter *iter; iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter)); if (!iter) return -ENOMEM; iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); return 0; } static int sysvipc_proc_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct ipc_proc_iter *iter = seq->private; put_ipc_ns(iter->ns); put_pid_ns(iter->pid_ns); return seq_release_private(inode, file); } static const struct proc_ops sysvipc_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = sysvipc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = sysvipc_proc_release, }; #endif /* CONFIG_PROC_FS */ |
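/*
 * A minimal sketch (not part of util.c; names such as example_getnew() are
 * hypothetical) of how a SysV IPC subsystem is expected to drive ipcget():
 * the subsystem supplies an ipc_ops with its own creation and association
 * hooks and lets the common code handle key lookup, locking and permission
 * checks.
 */
static int example_getnew(struct ipc_namespace *ns, struct ipc_params *params)
{
	/* Allocate the new object and publish it with ipc_addid() here. */
	return -ENOSYS;
}

static int example_associate(struct kern_ipc_perm *ipcp, int flg)
{
	/* Subsystem-specific check when an existing key is reused. */
	return 0;
}

static int example_get(struct ipc_namespace *ns, struct ipc_ids *ids,
		       key_t key, int flg)
{
	static const struct ipc_ops example_ops = {
		.getnew = example_getnew,
		.associate = example_associate,
	};
	struct ipc_params params = {
		.key = key,
		.flg = flg,
	};

	return ipcget(ns, ids, &example_ops, &params);
}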
// SPDX-License-Identifier: GPL-2.0 /* * Ldisc rw semaphore * * The ldisc semaphore is semantically a rw_semaphore that enforces * an alternate policy, namely: * 1) Supports lock wait timeouts * 2) Write waiter has priority * 3) Downgrading is not supported * * Implementation notes: * 1) Upper half of semaphore count is a wait count (differs from rwsem * in that rwsem normalizes the upper half to the wait bias) * 2) Lacks overflow checking * * The generic counting was copied and modified from include/asm-generic/rwsem.h * by Paul Mackerras <paulus@samba.org>. * * The scheduling policy was copied and modified from lib/rwsem.c * Written by David Howells (dhowells@redhat.com). * * This implementation incorporates the write lock stealing work of * Michel Lespinasse <walken@google.com>.
* * Copyright (C) 2013 Peter Hurley <peter@hurleysoftware.com> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/tty.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #if BITS_PER_LONG == 64 # define LDSEM_ACTIVE_MASK 0xffffffffL #else # define LDSEM_ACTIVE_MASK 0x0000ffffL #endif #define LDSEM_UNLOCKED 0L #define LDSEM_ACTIVE_BIAS 1L #define LDSEM_WAIT_BIAS (-LDSEM_ACTIVE_MASK-1) #define LDSEM_READ_BIAS LDSEM_ACTIVE_BIAS #define LDSEM_WRITE_BIAS (LDSEM_WAIT_BIAS + LDSEM_ACTIVE_BIAS) struct ldsem_waiter { struct list_head list; struct task_struct *task; }; /* * Initialize an ldsem: */ void __init_ldsem(struct ld_semaphore *sem, const char *name, struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif atomic_long_set(&sem->count, LDSEM_UNLOCKED); sem->wait_readers = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->read_wait); INIT_LIST_HEAD(&sem->write_wait); } static void __ldsem_wake_readers(struct ld_semaphore *sem) { struct ldsem_waiter *waiter, *next; struct task_struct *tsk; long adjust, count; /* * Try to grant read locks to all readers on the read wait list. * Note the 'active part' of the count is incremented by * the number of readers before waking any processes up. */ adjust = sem->wait_readers * (LDSEM_ACTIVE_BIAS - LDSEM_WAIT_BIAS); count = atomic_long_add_return(adjust, &sem->count); do { if (count > 0) break; if (atomic_long_try_cmpxchg(&sem->count, &count, count - adjust)) return; } while (1); list_for_each_entry_safe(waiter, next, &sem->read_wait, list) { tsk = waiter->task; smp_store_release(&waiter->task, NULL); wake_up_process(tsk); put_task_struct(tsk); } INIT_LIST_HEAD(&sem->read_wait); sem->wait_readers = 0; } static inline int writer_trylock(struct ld_semaphore *sem) { /* * Only wake this writer if the active part of the count can be * transitioned from 0 -> 1 */ long count = atomic_long_add_return(LDSEM_ACTIVE_BIAS, &sem->count); do { if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) return 1; if (atomic_long_try_cmpxchg(&sem->count, &count, count - LDSEM_ACTIVE_BIAS)) return 0; } while (1); } static void __ldsem_wake_writer(struct ld_semaphore *sem) { struct ldsem_waiter *waiter; waiter = list_entry(sem->write_wait.next, struct ldsem_waiter, list); wake_up_process(waiter->task); } /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then: * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - the spinlock must be held by the caller * - woken process blocks are discarded from the list after having task zeroed */ static void __ldsem_wake(struct ld_semaphore *sem) { if (!list_empty(&sem->write_wait)) __ldsem_wake_writer(sem); else if (!list_empty(&sem->read_wait)) __ldsem_wake_readers(sem); } static void ldsem_wake(struct ld_semaphore *sem) { unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); __ldsem_wake(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * wait for the read lock to be granted */ static struct ld_semaphore __sched * down_read_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; long adjust = -LDSEM_ACTIVE_BIAS + LDSEM_WAIT_BIAS; /* set up my own style of 
waitqueue */ raw_spin_lock_irq(&sem->wait_lock); /* * Try to reverse the lock attempt but if the count has changed * so that reversing fails, check if there are no waiters, * and early-out if not */ do { if (atomic_long_try_cmpxchg(&sem->count, &count, count + adjust)) { count += adjust; break; } if (count > 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; } } while (1); list_add_tail(&waiter.list, &sem->read_wait); sem->wait_readers++; waiter.task = current; get_task_struct(current); /* if there are no active locks, wake the new lock owner(s) */ if ((count & LDSEM_ACTIVE_MASK) == 0) __ldsem_wake(sem); raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!smp_load_acquire(&waiter.task)) break; if (!timeout) break; timeout = schedule_timeout(timeout); } __set_current_state(TASK_RUNNING); if (!timeout) { /* * Lock timed out but check if this task was just * granted lock ownership - if so, pretend there * was no timeout; otherwise, cleanup lock wait. */ raw_spin_lock_irq(&sem->wait_lock); if (waiter.task) { atomic_long_add_return(-LDSEM_WAIT_BIAS, &sem->count); sem->wait_readers--; list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); put_task_struct(waiter.task); return NULL; } raw_spin_unlock_irq(&sem->wait_lock); } return sem; } /* * wait for the write lock to be granted */ static struct ld_semaphore __sched * down_write_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; long adjust = -LDSEM_ACTIVE_BIAS; int locked = 0; /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); /* * Try to reverse the lock attempt but if the count has changed * so that reversing fails, check if the lock is now owned, * and early-out if so. */ do { if (atomic_long_try_cmpxchg(&sem->count, &count, count + adjust)) break; if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) { raw_spin_unlock_irq(&sem->wait_lock); return sem; } } while (1); list_add_tail(&waiter.list, &sem->write_wait); waiter.task = current; set_current_state(TASK_UNINTERRUPTIBLE); for (;;) { if (!timeout) break; raw_spin_unlock_irq(&sem->wait_lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->wait_lock); set_current_state(TASK_UNINTERRUPTIBLE); locked = writer_trylock(sem); if (locked) break; } if (!locked) atomic_long_add_return(-LDSEM_WAIT_BIAS, &sem->count); list_del(&waiter.list); /* * In case of timeout, wake up every reader who gave the right of way * to writer. Prevent separation readers into two groups: * one that helds semaphore and another that sleeps. 
* (in case of no contention with a writer) */ if (!locked && list_empty(&sem->write_wait)) __ldsem_wake_readers(sem); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); /* lock wait may have timed out */ if (!locked) return NULL; return sem; } static int __ldsem_down_read_nested(struct ld_semaphore *sem, int subclass, long timeout) { long count; rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); count = atomic_long_add_return(LDSEM_READ_BIAS, &sem->count); if (count <= 0) { lock_contended(&sem->dep_map, _RET_IP_); if (!down_read_failed(sem, count, timeout)) { rwsem_release(&sem->dep_map, _RET_IP_); return 0; } } lock_acquired(&sem->dep_map, _RET_IP_); return 1; } static int __ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, long timeout) { long count; rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); count = atomic_long_add_return(LDSEM_WRITE_BIAS, &sem->count); if ((count & LDSEM_ACTIVE_MASK) != LDSEM_ACTIVE_BIAS) { lock_contended(&sem->dep_map, _RET_IP_); if (!down_write_failed(sem, count, timeout)) { rwsem_release(&sem->dep_map, _RET_IP_); return 0; } } lock_acquired(&sem->dep_map, _RET_IP_); return 1; } /* * lock for reading -- returns 1 if successful, 0 if timed out */ int __sched ldsem_down_read(struct ld_semaphore *sem, long timeout) { might_sleep(); return __ldsem_down_read_nested(sem, 0, timeout); } /* * trylock for reading -- returns 1 if successful, 0 if contention */ int ldsem_down_read_trylock(struct ld_semaphore *sem) { long count = atomic_long_read(&sem->count); while (count >= 0) { if (atomic_long_try_cmpxchg(&sem->count, &count, count + LDSEM_READ_BIAS)) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); lock_acquired(&sem->dep_map, _RET_IP_); return 1; } } return 0; } /* * lock for writing -- returns 1 if successful, 0 if timed out */ int __sched ldsem_down_write(struct ld_semaphore *sem, long timeout) { might_sleep(); return __ldsem_down_write_nested(sem, 0, timeout); } /* * release a read lock */ void ldsem_up_read(struct ld_semaphore *sem) { long count; rwsem_release(&sem->dep_map, _RET_IP_); count = atomic_long_add_return(-LDSEM_READ_BIAS, &sem->count); if (count < 0 && (count & LDSEM_ACTIVE_MASK) == 0) ldsem_wake(sem); } /* * release a write lock */ void ldsem_up_write(struct ld_semaphore *sem) { long count; rwsem_release(&sem->dep_map, _RET_IP_); count = atomic_long_add_return(-LDSEM_WRITE_BIAS, &sem->count); if (count < 0) ldsem_wake(sem); } #ifdef CONFIG_DEBUG_LOCK_ALLOC int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass, long timeout) { might_sleep(); return __ldsem_down_read_nested(sem, subclass, timeout); } int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, long timeout) { might_sleep(); return __ldsem_down_write_nested(sem, subclass, timeout); } #endif |
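/*
 * A minimal usage sketch (assumed caller, not part of this file): both
 * ldsem_down_read() and ldsem_down_write() return 1 on success and 0 on
 * timeout, so a caller must be prepared to bail out without holding the lock.
 */
static int example_ldsem_reader(struct ld_semaphore *sem)
{
	if (!ldsem_down_read(sem, msecs_to_jiffies(100)))
		return -EBUSY;	/* timed out, lock not acquired */

	/* ... read-side critical section ... */

	ldsem_up_read(sem);
	return 0;
}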
// SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); trace_tcp_cong_state_set(sk, ca_state); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } /* Must be called with rcu lock held */ static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif return ca; } /* Simple linear search, not much in here.
*/ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (e->key == key) return e; } return NULL; } int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } return 0; } /* Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { pr_notice("%s already registered or non-unique key\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module gets removed entirely. * * A try_module_get() should fail by now as our module is * in "going" state since no refs are held anymore and * module_exit() handler being called. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Replace a registered old ca with a new one. * * The new ca must have the same name as the old one, that has been * registered. */ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; int ret = 0; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); existing = tcp_ca_find_key(old_ca->key); if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { pr_notice("%s not registered or non-unique key\n", ca->name); ret = -EINVAL; } else if (existing != old_ca) { pr_notice("invalid old congestion control algorithm to replace\n"); ret = -EINVAL; } else { /* Add the new one before removing the old one to keep * one implementation available all the time. */ list_add_tail_rcu(&ca->list, &tcp_cong_list); list_del_rcu(&existing->list); pr_debug("%s updated\n", ca->name); } spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module or struct_ops gets removed entirely. */ if (!ret) synchronize_rcu(); return ret; } u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = tcp_ca_find_autoload(name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; } rcu_read_unlock(); return key; } char *tcp_ca_get_name_by_key(u32 key, char *buffer) { const struct tcp_congestion_ops *ca; char *ret = NULL; rcu_read_lock(); ca = tcp_ca_find_key(key); if (ca) { strscpy(buffer, ca->name, TCP_CA_NAME_MAX); ret = buffer; } rcu_read_unlock(); return ret; } /* Assign choice of congestion control. 
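* The per-netns default (net->ipv4.tcp_congestion_control) is used; if a * module reference cannot be taken via bpf_try_module_get(), the socket * falls back to tcp_reno.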
*/ void tcp_assign_congestion_control(struct sock *sk) { struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); icsk->icsk_ca_initialized = 0; bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *prev; int ret; rcu_read_lock(); ca = tcp_ca_find_autoload(name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else if (!net_eq(net, &init_net) && !(ca->flags & TCP_CONG_NON_RESTRICTED)) { /* Only init netns can set default to a restricted algorithm */ ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } rcu_read_unlock(); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(&init_net, CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? 
"" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(struct net *net, char *name) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); strscpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; int err = 0; if (icsk->icsk_ca_dst_locked) return -EPERM; rcu_read_lock(); if (!load) ca = tcp_ca_find(name); else ca = tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; else tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but * something better;) a packet is only considered (s)acked in its entirety to * defend the ACK attacks described in the RFC. Slow start processes a stretch * ACK of degree N as if N acks of degree 1 are received back to back except * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ __bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); acked -= cwnd - tcp_snd_cwnd(tp); tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. 
*/ __bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ __bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } /* In dangerous area, increase slowly. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, }; |
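/*
 * A minimal sketch (hypothetical "example" module, not part of tcp_cong.c)
 * of the registration API above: a module fills in the mandatory ops
 * (ssthresh, undo_cwnd and one of cong_avoid/cong_control) and registers
 * itself at init time.
 */
static u32 example_ssthresh(struct sock *sk)
{
	return max(tcp_snd_cwnd(tcp_sk(sk)) >> 1U, 2U);
}

static u32 example_undo_cwnd(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}

static void example_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;
	if (tcp_in_slow_start(tp))
		acked = tcp_slow_start(tp, acked);
	if (acked)
		tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}

static struct tcp_congestion_ops example_cong_ops __read_mostly = {
	.name		= "example",
	.owner		= THIS_MODULE,
	.ssthresh	= example_ssthresh,
	.cong_avoid	= example_cong_avoid,
	.undo_cwnd	= example_undo_cwnd,
};

static int __init example_cong_init(void)
{
	return tcp_register_congestion_control(&example_cong_ops);
}

static void __exit example_cong_exit(void)
{
	tcp_unregister_congestion_control(&example_cong_ops);
}

module_init(example_cong_init);
module_exit(example_cong_exit);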
// SPDX-License-Identifier: GPL-2.0-or-later /* * Initialization routines * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/device.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/ctype.h> #include <linux/pm.h> #include <linux/debugfs.h> #include <linux/completion.h> #include <linux/interrupt.h> #include <sound/core.h> #include <sound/control.h> #include <sound/info.h> /* monitor files for graceful shutdown (hotplug) */ struct snd_monitor_file { struct file *file; const struct file_operations *disconnected_f_op; struct list_head shutdown_list; /* still need to shutdown */ struct list_head list; /* link of monitor files */ }; static DEFINE_SPINLOCK(shutdown_lock); static LIST_HEAD(shutdown_files); static const struct file_operations snd_shutdown_f_ops; /* locked for registering/using */ static DECLARE_BITMAP(snd_cards_lock, SNDRV_CARDS); static struct snd_card *snd_cards[SNDRV_CARDS]; static DEFINE_MUTEX(snd_card_mutex); static char *slots[SNDRV_CARDS]; module_param_array(slots, charp, NULL, 0444); MODULE_PARM_DESC(slots, "Module names assigned to the slots."); /* return non-zero if the given index is reserved for the given * module via slots option */ static int module_slot_match(struct module *module, int idx) { int match = 1; #ifdef CONFIG_MODULES const char *s1, *s2; if (!module || !*module->name || !slots[idx]) return 0; s1 = module->name; s2 = slots[idx]; if (*s2 == '!') { match = 0; /* negative match */ s2++; } /* compare module name strings * hyphens are handled as equivalent with underscore */ for (;;) { char c1 = *s1++; char c2 = *s2++; if (c1 == '-') c1 = '_'; if (c2 == '-') c2 = '_'; if (c1 != c2) return !match; if (!c1) break; } #endif /* CONFIG_MODULES */ return match; } #if IS_ENABLED(CONFIG_SND_MIXER_OSS) int (*snd_mixer_oss_notify_callback)(struct snd_card *card, int free_flag); EXPORT_SYMBOL(snd_mixer_oss_notify_callback); #endif static int check_empty_slot(struct module *module, int slot) { return !slots[slot] || !*slots[slot]; } /* return an empty slot number (>= 0) found in the given bitmask @mask. * @mask == -1 == 0xffffffff means: take any free slot up to 32 * when no slot is available, return the original @mask as is.
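* Only slots below 32 are filtered by @mask (a slot whose bit is clear is * skipped); slots from 32 up to SNDRV_CARDS are always eligible.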
*/ static int get_slot_from_bitmask(int mask, int (*check)(struct module *, int), struct module *module) { int slot; for (slot = 0; slot < SNDRV_CARDS; slot++) { if (slot < 32 && !(mask & (1U << slot))) continue; if (!test_bit(slot, snd_cards_lock)) { if (check(module, slot)) return slot; /* found */ } } return mask; /* unchanged */ } /* the default release callback set in snd_device_alloc() */ static void default_release_alloc(struct device *dev) { kfree(dev); } /** * snd_device_alloc - Allocate and initialize struct device for sound devices * @dev_p: pointer to store the allocated device * @card: card to assign, optional * * For releasing the allocated device, call put_device(). */ int snd_device_alloc(struct device **dev_p, struct snd_card *card) { struct device *dev; *dev_p = NULL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; device_initialize(dev); if (card) dev->parent = &card->card_dev; dev->class = &sound_class; dev->release = default_release_alloc; *dev_p = dev; return 0; } EXPORT_SYMBOL_GPL(snd_device_alloc); static int snd_card_init(struct snd_card *card, struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size); static int snd_card_do_free(struct snd_card *card); static const struct attribute_group card_dev_attr_group; static void release_card_device(struct device *dev) { snd_card_do_free(dev_to_snd_card(dev)); } /** * snd_card_new - create and initialize a soundcard structure * @parent: the parent device object * @idx: card index (address) [0 ... (SNDRV_CARDS-1)] * @xid: card identification (ASCII string) * @module: top level module for locking * @extra_size: allocate this extra size after the main soundcard structure * @card_ret: the pointer to store the created card instance * * The function allocates snd_card instance via kzalloc with the given * space for the driver to use freely. The allocated struct is stored * in the given card_ret pointer. * * Return: Zero if successful or a negative error code. */ int snd_card_new(struct device *parent, int idx, const char *xid, struct module *module, int extra_size, struct snd_card **card_ret) { struct snd_card *card; int err; if (snd_BUG_ON(!card_ret)) return -EINVAL; *card_ret = NULL; if (extra_size < 0) extra_size = 0; card = kzalloc(sizeof(*card) + extra_size, GFP_KERNEL); if (!card) return -ENOMEM; err = snd_card_init(card, parent, idx, xid, module, extra_size); if (err < 0) return err; /* card is freed by error handler */ *card_ret = card; return 0; } EXPORT_SYMBOL(snd_card_new); static void __snd_card_release(struct device *dev, void *data) { snd_card_free(data); } /** * snd_devm_card_new - managed snd_card object creation * @parent: the parent device object * @idx: card index (address) [0 ... (SNDRV_CARDS-1)] * @xid: card identification (ASCII string) * @module: top level module for locking * @extra_size: allocate this extra size after the main soundcard structure * @card_ret: the pointer to store the created card instance * * This function works like snd_card_new() but manages the allocated resource * via devres, i.e. you don't need to free explicitly. * * When a snd_card object is created with this function and registered via * snd_card_register(), the very first devres action to call snd_card_free() * is added automatically. In that way, the resource disconnection is assured * at first, then released in the expected order. 
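 *
 * A typical managed probe would call it roughly as follows (a sketch; the
 * pdev name is assumed):
 *
 *	err = snd_devm_card_new(&pdev->dev, -1, NULL, THIS_MODULE, 0, &card);
 *	if (err < 0)
 *		return err;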
* * If an error happens at the probe before snd_card_register() is called and * there have been other devres resources, you'd need to free the card manually * via snd_card_free() call in the error; otherwise it may lead to UAF due to * devres call orders. You can use snd_card_free_on_error() helper for * handling it more easily. * * Return: zero if successful, or a negative error code */ int snd_devm_card_new(struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size, struct snd_card **card_ret) { struct snd_card *card; int err; *card_ret = NULL; card = devres_alloc(__snd_card_release, sizeof(*card) + extra_size, GFP_KERNEL); if (!card) return -ENOMEM; card->managed = true; err = snd_card_init(card, parent, idx, xid, module, extra_size); if (err < 0) { devres_free(card); /* in managed mode, we need to free manually */ return err; } devres_add(parent, card); *card_ret = card; return 0; } EXPORT_SYMBOL_GPL(snd_devm_card_new); /** * snd_card_free_on_error - a small helper for handling devm probe errors * @dev: the managed device object * @ret: the return code from the probe callback * * This function handles the explicit snd_card_free() call at the error from * the probe callback. It's just a small helper for simplifying the error * handling with the managed devices. * * Return: zero if successful, or a negative error code */ int snd_card_free_on_error(struct device *dev, int ret) { struct snd_card *card; if (!ret) return 0; card = devres_find(dev, __snd_card_release, NULL, NULL); if (card) snd_card_free(card); return ret; } EXPORT_SYMBOL_GPL(snd_card_free_on_error); static int snd_card_init(struct snd_card *card, struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size) { int err; if (extra_size > 0) card->private_data = (char *)card + sizeof(struct snd_card); if (xid) strscpy(card->id, xid, sizeof(card->id)); err = 0; scoped_guard(mutex, &snd_card_mutex) { if (idx < 0) /* first check the matching module-name slot */ idx = get_slot_from_bitmask(idx, module_slot_match, module); if (idx < 0) /* if not matched, assign an empty slot */ idx = get_slot_from_bitmask(idx, check_empty_slot, module); if (idx < 0) err = -ENODEV; else if (idx < snd_ecards_limit) { if (test_bit(idx, snd_cards_lock)) err = -EBUSY; /* invalid */ } else if (idx >= SNDRV_CARDS) err = -ENODEV; if (!err) { set_bit(idx, snd_cards_lock); /* lock it */ if (idx >= snd_ecards_limit) snd_ecards_limit = idx + 1; /* increase the limit */ } } if (err < 0) { dev_err(parent, "cannot find the slot for index %d (range 0-%i), error: %d\n", idx, snd_ecards_limit - 1, err); if (!card->managed) kfree(card); /* manually free here, as no destructor called */ return err; } card->dev = parent; card->number = idx; WARN_ON(IS_MODULE(CONFIG_SND) && !module); card->module = module; INIT_LIST_HEAD(&card->devices); init_rwsem(&card->controls_rwsem); rwlock_init(&card->controls_rwlock); INIT_LIST_HEAD(&card->controls); INIT_LIST_HEAD(&card->ctl_files); #ifdef CONFIG_SND_CTL_FAST_LOOKUP xa_init(&card->ctl_numids); xa_init(&card->ctl_hash); #endif spin_lock_init(&card->files_lock); INIT_LIST_HEAD(&card->files_list); mutex_init(&card->memory_mutex); #ifdef CONFIG_PM init_waitqueue_head(&card->power_sleep); init_waitqueue_head(&card->power_ref_sleep); atomic_set(&card->power_ref, 0); #endif init_waitqueue_head(&card->remove_sleep); card->sync_irq = -1; device_initialize(&card->card_dev); card->card_dev.parent = parent; card->card_dev.class = &sound_class; card->card_dev.release = 
release_card_device; card->card_dev.groups = card->dev_groups; card->dev_groups[0] = &card_dev_attr_group; err = kobject_set_name(&card->card_dev.kobj, "card%d", idx); if (err < 0) goto __error; snprintf(card->irq_descr, sizeof(card->irq_descr), "%s:%s", dev_driver_string(card->dev), dev_name(&card->card_dev)); /* the control interface cannot be accessed from the user space until */ /* snd_cards_bitmask and snd_cards are set with snd_card_register */ err = snd_ctl_create(card); if (err < 0) { dev_err(parent, "unable to register control minors\n"); goto __error; } err = snd_info_card_create(card); if (err < 0) { dev_err(parent, "unable to create card info\n"); goto __error_ctl; } #ifdef CONFIG_SND_DEBUG card->debugfs_root = debugfs_create_dir(dev_name(&card->card_dev), sound_debugfs_root); #endif return 0; __error_ctl: snd_device_free_all(card); __error: put_device(&card->card_dev); return err; } /** * snd_card_ref - Get the card object from the index * @idx: the card index * * Returns a card object corresponding to the given index or NULL if not found. * Release the object via snd_card_unref(). * * Return: a card object or NULL */ struct snd_card *snd_card_ref(int idx) { struct snd_card *card; guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) get_device(&card->card_dev); return card; } EXPORT_SYMBOL_GPL(snd_card_ref); /* return non-zero if a card is already locked */ int snd_card_locked(int card) { guard(mutex)(&snd_card_mutex); return test_bit(card, snd_cards_lock); } static loff_t snd_disconnect_llseek(struct file *file, loff_t offset, int orig) { return -ENODEV; } static ssize_t snd_disconnect_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return -ENODEV; } static ssize_t snd_disconnect_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { return -ENODEV; } static int snd_disconnect_release(struct inode *inode, struct file *file) { struct snd_monitor_file *df = NULL, *_df; scoped_guard(spinlock, &shutdown_lock) { list_for_each_entry(_df, &shutdown_files, shutdown_list) { if (_df->file == file) { df = _df; list_del_init(&df->shutdown_list); break; } } } if (likely(df)) { if ((file->f_flags & FASYNC) && df->disconnected_f_op->fasync) df->disconnected_f_op->fasync(-1, file, 0); return df->disconnected_f_op->release(inode, file); } panic("%s(%p, %p) failed!", __func__, inode, file); } static __poll_t snd_disconnect_poll(struct file * file, poll_table * wait) { return EPOLLERR | EPOLLNVAL; } static long snd_disconnect_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -ENODEV; } static int snd_disconnect_mmap(struct file *file, struct vm_area_struct *vma) { return -ENODEV; } static int snd_disconnect_fasync(int fd, struct file *file, int on) { return -ENODEV; } static const struct file_operations snd_shutdown_f_ops = { .owner = THIS_MODULE, .llseek = snd_disconnect_llseek, .read = snd_disconnect_read, .write = snd_disconnect_write, .release = snd_disconnect_release, .poll = snd_disconnect_poll, .unlocked_ioctl = snd_disconnect_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snd_disconnect_ioctl, #endif .mmap = snd_disconnect_mmap, .fasync = snd_disconnect_fasync }; /** * snd_card_disconnect - disconnect all APIs from the file-operations (user space) * @card: soundcard structure * * Disconnects all APIs from the file-operations (user space). * * Return: Zero, otherwise a negative error code. 
* * Note: The current implementation replaces all active file->f_op with special * dummy file operations (they do nothing except release). */ void snd_card_disconnect(struct snd_card *card) { struct snd_monitor_file *mfile; if (!card) return; scoped_guard(spinlock, &card->files_lock) { if (card->shutdown) return; card->shutdown = 1; /* replace file->f_op with special dummy operations */ list_for_each_entry(mfile, &card->files_list, list) { /* it's critical part, use endless loop */ /* we have no room to fail */ mfile->disconnected_f_op = mfile->file->f_op; scoped_guard(spinlock, &shutdown_lock) list_add(&mfile->shutdown_list, &shutdown_files); mfile->file->f_op = &snd_shutdown_f_ops; fops_get(mfile->file->f_op); } } #ifdef CONFIG_PM /* wake up sleepers here before other callbacks for avoiding potential * deadlocks with other locks (e.g. in kctls); * then this notifies the shutdown and sleepers would abort immediately */ wake_up_all(&card->power_sleep); #endif /* notify all connected devices about disconnection */ /* at this point, they cannot respond to any calls except release() */ #if IS_ENABLED(CONFIG_SND_MIXER_OSS) if (snd_mixer_oss_notify_callback) snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_DISCONNECT); #endif /* notify all devices that we are disconnected */ snd_device_disconnect_all(card); if (card->sync_irq > 0) synchronize_irq(card->sync_irq); snd_info_card_disconnect(card); #ifdef CONFIG_SND_DEBUG debugfs_remove(card->debugfs_root); card->debugfs_root = NULL; #endif if (card->registered) { device_del(&card->card_dev); card->registered = false; } /* disable fops (user space) operations for ALSA API */ scoped_guard(mutex, &snd_card_mutex) { snd_cards[card->number] = NULL; clear_bit(card->number, snd_cards_lock); } snd_power_sync_ref(card); } EXPORT_SYMBOL(snd_card_disconnect); /** * snd_card_disconnect_sync - disconnect card and wait until files get closed * @card: card object to disconnect * * This calls snd_card_disconnect() for disconnecting all belonging components * and waits until all pending files get closed. * It assures that all accesses from user-space finished so that the driver * can release its resources gracefully. */ void snd_card_disconnect_sync(struct snd_card *card) { snd_card_disconnect(card); guard(spinlock_irq)(&card->files_lock); wait_event_lock_irq(card->remove_sleep, list_empty(&card->files_list), card->files_lock); } EXPORT_SYMBOL_GPL(snd_card_disconnect_sync); static int snd_card_do_free(struct snd_card *card) { card->releasing = true; #if IS_ENABLED(CONFIG_SND_MIXER_OSS) if (snd_mixer_oss_notify_callback) snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_FREE); #endif snd_device_free_all(card); if (card->private_free) card->private_free(card); if (snd_info_card_free(card) < 0) { dev_warn(card->dev, "unable to free card info\n"); /* Not fatal error */ } if (card->release_completion) complete(card->release_completion); if (!card->managed) kfree(card); return 0; } /** * snd_card_free_when_closed - Disconnect the card, free it later eventually * @card: soundcard structure * * Unlike snd_card_free(), this function doesn't try to release the card * resource immediately, but tries to disconnect at first. When the card * is still in use, the function returns before freeing the resources. * The card resources will be freed when the refcount gets to zero. 
* * Return: zero if successful, or a negative error code */ void snd_card_free_when_closed(struct snd_card *card) { if (!card) return; snd_card_disconnect(card); put_device(&card->card_dev); return; } EXPORT_SYMBOL(snd_card_free_when_closed); /** * snd_card_free - frees given soundcard structure * @card: soundcard structure * * This function releases the soundcard structure and the all assigned * devices automatically. That is, you don't have to release the devices * by yourself. * * This function waits until the all resources are properly released. * * Return: Zero. Frees all associated devices and frees the control * interface associated to given soundcard. */ void snd_card_free(struct snd_card *card) { DECLARE_COMPLETION_ONSTACK(released); /* The call of snd_card_free() is allowed from various code paths; * a manual call from the driver and the call via devres_free, and * we need to avoid double-free. Moreover, the release via devres * may call snd_card_free() twice due to its nature, we need to have * the check here at the beginning. */ if (card->releasing) return; card->release_completion = &released; snd_card_free_when_closed(card); /* wait, until all devices are ready for the free operation */ wait_for_completion(&released); } EXPORT_SYMBOL(snd_card_free); /* check, if the character is in the valid ASCII range */ static inline bool safe_ascii_char(char c) { return isascii(c) && isalnum(c); } /* retrieve the last word of shortname or longname */ static const char *retrieve_id_from_card_name(const char *name) { const char *spos = name; while (*name) { if (isspace(*name) && safe_ascii_char(name[1])) spos = name + 1; name++; } return spos; } /* return true if the given id string doesn't conflict any other card ids */ static bool card_id_ok(struct snd_card *card, const char *id) { int i; if (!snd_info_check_reserved_words(id)) return false; for (i = 0; i < snd_ecards_limit; i++) { if (snd_cards[i] && snd_cards[i] != card && !strcmp(snd_cards[i]->id, id)) return false; } return true; } /* copy to card->id only with valid letters from nid */ static void copy_valid_id_string(struct snd_card *card, const char *src, const char *nid) { char *id = card->id; while (*nid && !safe_ascii_char(*nid)) nid++; if (isdigit(*nid)) *id++ = isalpha(*src) ? *src : 'D'; while (*nid && (size_t)(id - card->id) < sizeof(card->id) - 1) { if (safe_ascii_char(*nid)) *id++ = *nid; nid++; } *id = 0; } /* Set card->id from the given string * If the string conflicts with other ids, add a suffix to make it unique. */ static void snd_card_set_id_no_lock(struct snd_card *card, const char *src, const char *nid) { int len, loops; bool is_default = false; char *id; copy_valid_id_string(card, src, nid); id = card->id; again: /* use "Default" for obviously invalid strings * ("card" conflicts with proc directories) */ if (!*id || !strncmp(id, "card", 4)) { strscpy(card->id, "Default"); is_default = true; } len = strlen(id); for (loops = 0; loops < SNDRV_CARDS; loops++) { char sfxstr[5]; /* "_012" */ int sfxlen, slen; if (card_id_ok(card, id)) return; /* OK */ /* Add _XYZ suffix */ sfxlen = scnprintf(sfxstr, sizeof(sfxstr), "_%X", loops + 1); if (len + sfxlen >= sizeof(card->id)) slen = sizeof(card->id) - sfxlen - 1; else slen = len; strscpy(id + slen, sfxstr, sizeof(card->id) - slen); } /* fallback to the default id */ if (!is_default) { *id = 0; goto again; } /* last resort... 
*/ dev_err(card->dev, "unable to set card id (%s)\n", id); if (card->proc_root->name) strscpy(card->id, card->proc_root->name, sizeof(card->id)); } /** * snd_card_set_id - set card identification name * @card: soundcard structure * @nid: new identification string * * This function sets the card identification and checks for name * collisions. */ void snd_card_set_id(struct snd_card *card, const char *nid) { /* check if user specified own card->id */ if (card->id[0] != '\0') return; guard(mutex)(&snd_card_mutex); snd_card_set_id_no_lock(card, nid, nid); } EXPORT_SYMBOL(snd_card_set_id); static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = container_of(dev, struct snd_card, card_dev); return sysfs_emit(buf, "%s\n", card->id); } static ssize_t id_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct snd_card *card = container_of(dev, struct snd_card, card_dev); char buf1[sizeof(card->id)]; size_t copy = count > sizeof(card->id) - 1 ? sizeof(card->id) - 1 : count; size_t idx; int c; for (idx = 0; idx < copy; idx++) { c = buf[idx]; if (!safe_ascii_char(c) && c != '_' && c != '-') return -EINVAL; } memcpy(buf1, buf, copy); buf1[copy] = '\0'; guard(mutex)(&snd_card_mutex); if (!card_id_ok(NULL, buf1)) return -EEXIST; strscpy(card->id, buf1); snd_info_card_id_change(card); return count; } static DEVICE_ATTR_RW(id); static ssize_t number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = container_of(dev, struct snd_card, card_dev); return sysfs_emit(buf, "%i\n", card->number); } static DEVICE_ATTR_RO(number); static struct attribute *card_dev_attrs[] = { &dev_attr_id.attr, &dev_attr_number.attr, NULL }; static const struct attribute_group card_dev_attr_group = { .attrs = card_dev_attrs, }; /** * snd_card_add_dev_attr - Append a new sysfs attribute group to card * @card: card instance * @group: attribute group to append * * Return: zero if successful, or a negative error code */ int snd_card_add_dev_attr(struct snd_card *card, const struct attribute_group *group) { int i; /* loop for (arraysize-1) here to keep NULL at the last entry */ for (i = 0; i < ARRAY_SIZE(card->dev_groups) - 1; i++) { if (!card->dev_groups[i]) { card->dev_groups[i] = group; return 0; } } dev_err(card->dev, "Too many groups assigned\n"); return -ENOSPC; } EXPORT_SYMBOL_GPL(snd_card_add_dev_attr); static void trigger_card_free(void *data) { snd_card_free(data); } /** * snd_card_register - register the soundcard * @card: soundcard structure * * This function registers all the devices assigned to the soundcard. * Until calling this, the ALSA control interface is blocked from the * external accesses. Thus, you should call this function at the end * of the initialization of the card. * * Return: Zero otherwise a negative error code if the registration failed. 
*/ int snd_card_register(struct snd_card *card) { int err; if (snd_BUG_ON(!card)) return -EINVAL; if (!card->registered) { err = device_add(&card->card_dev); if (err < 0) return err; card->registered = true; } else { if (card->managed) devm_remove_action(card->dev, trigger_card_free, card); } if (card->managed) { err = devm_add_action(card->dev, trigger_card_free, card); if (err < 0) return err; } err = snd_device_register_all(card); if (err < 0) return err; scoped_guard(mutex, &snd_card_mutex) { if (snd_cards[card->number]) { /* already registered */ return snd_info_card_register(card); /* register pending info */ } if (*card->id) { /* make a unique id name from the given string */ char tmpid[sizeof(card->id)]; memcpy(tmpid, card->id, sizeof(card->id)); snd_card_set_id_no_lock(card, tmpid, tmpid); } else { /* create an id from either shortname or longname */ const char *src; src = *card->shortname ? card->shortname : card->longname; snd_card_set_id_no_lock(card, src, retrieve_id_from_card_name(src)); } snd_cards[card->number] = card; } err = snd_info_card_register(card); if (err < 0) return err; #if IS_ENABLED(CONFIG_SND_MIXER_OSS) if (snd_mixer_oss_notify_callback) snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_REGISTER); #endif return 0; } EXPORT_SYMBOL(snd_card_register); #ifdef CONFIG_SND_PROC_FS static void snd_card_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx, count; struct snd_card *card; for (idx = count = 0; idx < SNDRV_CARDS; idx++) { guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) { count++; snd_iprintf(buffer, "%2i [%-15s]: %s - %s\n", idx, card->id, card->driver, card->shortname); snd_iprintf(buffer, " %s\n", card->longname); } } if (!count) snd_iprintf(buffer, "--- no soundcards ---\n"); } #ifdef CONFIG_SND_OSSEMUL void snd_card_info_read_oss(struct snd_info_buffer *buffer) { int idx, count; struct snd_card *card; for (idx = count = 0; idx < SNDRV_CARDS; idx++) { guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) { count++; snd_iprintf(buffer, "%s\n", card->longname); } } if (!count) { snd_iprintf(buffer, "--- no soundcards ---\n"); } } #endif #ifdef CONFIG_MODULES static void snd_card_module_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx; struct snd_card *card; for (idx = 0; idx < SNDRV_CARDS; idx++) { guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) snd_iprintf(buffer, "%2i %s\n", idx, card->module->name); } } #endif int __init snd_card_info_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "cards", NULL); if (! entry) return -ENOMEM; entry->c.text.read = snd_card_info_read; if (snd_info_register(entry) < 0) return -ENOMEM; /* freed in error path */ #ifdef CONFIG_MODULES entry = snd_info_create_module_entry(THIS_MODULE, "modules", NULL); if (!entry) return -ENOMEM; entry->c.text.read = snd_card_module_info_read; if (snd_info_register(entry) < 0) return -ENOMEM; /* freed in error path */ #endif return 0; } #endif /* CONFIG_SND_PROC_FS */ /** * snd_component_add - add a component string * @card: soundcard structure * @component: the component id string * * This function adds the component id string to the supported list. * The component can be referred from the alsa-lib. * * Return: Zero otherwise a negative error code. 
*/ int snd_component_add(struct snd_card *card, const char *component) { char *ptr; int len = strlen(component); ptr = strstr(card->components, component); if (ptr != NULL) { if (ptr[len] == '\0' || ptr[len] == ' ') /* already there */ return 1; } if (strlen(card->components) + 1 + len + 1 > sizeof(card->components)) { snd_BUG(); return -ENOMEM; } if (card->components[0] != '\0') strcat(card->components, " "); strcat(card->components, component); return 0; } EXPORT_SYMBOL(snd_component_add); /** * snd_card_file_add - add the file to the file list of the card * @card: soundcard structure * @file: file pointer * * This function adds the file to the file linked-list of the card. * This linked-list is used to keep tracking the connection state, * and to avoid the release of busy resources by hotplug. * * Return: zero or a negative error code. */ int snd_card_file_add(struct snd_card *card, struct file *file) { struct snd_monitor_file *mfile; mfile = kmalloc(sizeof(*mfile), GFP_KERNEL); if (mfile == NULL) return -ENOMEM; mfile->file = file; mfile->disconnected_f_op = NULL; INIT_LIST_HEAD(&mfile->shutdown_list); guard(spinlock)(&card->files_lock); if (card->shutdown) { kfree(mfile); return -ENODEV; } list_add(&mfile->list, &card->files_list); get_device(&card->card_dev); return 0; } EXPORT_SYMBOL(snd_card_file_add); /** * snd_card_file_remove - remove the file from the file list * @card: soundcard structure * @file: file pointer * * This function removes the file formerly added to the card via * snd_card_file_add() function. * If all files are removed and snd_card_free_when_closed() was * called beforehand, it processes the pending release of * resources. * * Return: Zero or a negative error code. */ int snd_card_file_remove(struct snd_card *card, struct file *file) { struct snd_monitor_file *mfile, *found = NULL; scoped_guard(spinlock, &card->files_lock) { list_for_each_entry(mfile, &card->files_list, list) { if (mfile->file == file) { list_del(&mfile->list); scoped_guard(spinlock, &shutdown_lock) list_del(&mfile->shutdown_list); if (mfile->disconnected_f_op) fops_put(mfile->disconnected_f_op); found = mfile; break; } } if (list_empty(&card->files_list)) wake_up_all(&card->remove_sleep); } if (!found) { dev_err(card->dev, "card file remove problem (%p)\n", file); return -ENOENT; } kfree(found); put_device(&card->card_dev); return 0; } EXPORT_SYMBOL(snd_card_file_remove); #ifdef CONFIG_PM /** * snd_power_ref_and_wait - wait until the card gets powered up * @card: soundcard structure * * Take the power_ref reference count of the given card, and * wait until the card gets powered up to SNDRV_CTL_POWER_D0 state. * The refcount is down again while sleeping until power-up, hence this * function can be used for syncing the floating control ops accesses, * typically around calling control ops. * * The caller needs to pull down the refcount via snd_power_unref() later * no matter whether the error is returned from this function or not. * * Return: Zero if successful, or a negative error code. */ int snd_power_ref_and_wait(struct snd_card *card) { snd_power_ref(card); if (snd_power_get_state(card) == SNDRV_CTL_POWER_D0) return 0; wait_event_cmd(card->power_sleep, card->shutdown || snd_power_get_state(card) == SNDRV_CTL_POWER_D0, snd_power_unref(card), snd_power_ref(card)); return card->shutdown ? 
-ENODEV : 0; } EXPORT_SYMBOL_GPL(snd_power_ref_and_wait); /** * snd_power_wait - wait until the card gets powered up (old form) * @card: soundcard structure * * Wait until the card gets powered up to SNDRV_CTL_POWER_D0 state. * * Return: Zero if successful, or a negative error code. */ int snd_power_wait(struct snd_card *card) { int ret; ret = snd_power_ref_and_wait(card); snd_power_unref(card); return ret; } EXPORT_SYMBOL(snd_power_wait); #endif /* CONFIG_PM */
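/*
 * Illustrative usage sketch (not part of the original sound/core code):
 * a minimal probe/remove pair built on the snd_card_new() /
 * snd_card_register() / snd_card_free() lifecycle documented above.
 * The my_chip structure, the my_driver_* function names and the
 * "My Chip" strings are hypothetical placeholders; a real driver would
 * also create PCM and control components before registering.
 */
#include <linux/device.h>
#include <linux/module.h>
#include <linux/string.h>
#include <sound/core.h>

struct my_chip {
	struct snd_card *card;
	int irq;
};

static int my_driver_probe(struct device *dev)
{
	struct snd_card *card;
	struct my_chip *chip;
	int err;

	/* extra_size makes the card allocate room for our chip struct */
	err = snd_card_new(dev, -1 /* any free slot */, NULL /* auto id */,
			   THIS_MODULE, sizeof(*chip), &card);
	if (err < 0)
		return err;

	chip = card->private_data;	/* points at the extra_size area */
	chip->card = card;
	chip->irq = -1;

	strscpy(card->driver, "my_chip", sizeof(card->driver));
	strscpy(card->shortname, "My Chip", sizeof(card->shortname));
	strscpy(card->longname, "My Chip at 0x1000", sizeof(card->longname));

	/* ... build PCM/mixer components here ... */

	err = snd_card_register(card);	/* exposes the card to user space */
	if (err < 0) {
		snd_card_free(card);	/* not yet visible, free directly */
		return err;
	}

	dev_set_drvdata(dev, card);
	return 0;
}

static void my_driver_remove(struct device *dev)
{
	/* disconnects first, then waits until all open files are closed */
	snd_card_free(dev_get_drvdata(dev));
}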
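/*
 * Illustrative sketch of the devres-managed variant described in the
 * snd_devm_card_new() kernel-doc above (again not part of the original
 * file; it assumes the same headers as the previous sketch).  Once
 * snd_card_register() succeeds, a devres action frees the card
 * automatically; for errors before that point the explicit
 * snd_card_free_on_error() helper covers the probe error path.
 * my_devm_probe() and my_build_components() are hypothetical names.
 */
static int my_build_components(struct snd_card *card)
{
	/* create PCM/control components here */
	return 0;
}

static int my_devm_probe(struct device *dev)
{
	struct snd_card *card;
	int err;

	err = snd_devm_card_new(dev, -1, NULL, THIS_MODULE, 0, &card);
	if (err < 0)
		return err;

	err = my_build_components(card);
	if (err < 0)
		goto error;

	err = snd_card_register(card);	/* installs the devres free action */
	if (err < 0)
		goto error;

	return 0;

error:
	/*
	 * If the probe fails before snd_card_register(), devres release
	 * ordering can lead to use-after-free (see the kernel-doc above),
	 * so free the card explicitly via the helper.
	 */
	return snd_card_free_on_error(dev, err);
}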
* * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H #include <linux/magic.h> /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 /* * An OCFS2 volume starts this way: * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. * * All other structures are found from the superblock information. * * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a * blocksize of 2K, it is 4096 bytes into disk. */ #define OCFS2_SUPER_BLOCK_BLKNO 2 /* * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could * grow if needed. */ #define OCFS2_MIN_CLUSTERSIZE 4096 #define OCFS2_MAX_CLUSTERSIZE 1048576 /* * Blocks cannot be bigger than clusters, so the maximum blocksize is the * minimum cluster size. */ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" #define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" #define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" #define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_compat & (mask) ) #define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) #define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) #define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_compat |= (mask) #define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_ro_compat |= (mask) #define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat |= (mask) #define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_compat &= ~(mask) #define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \ | OCFS2_FEATURE_COMPAT_JBD2_SB) #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The * filesystem driver can't load them, but the library can. Never put * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. */ #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 /* * tunefs sets this incompat flag before starting the resize and clears it * at the end. This flag protects users from inadvertently mounting the fs * after an aborted run without fsck-ing. 
*/ #define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 /* Used to denote a non-clustered volume */ #define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 /* Support for sparse allocation in b-trees */ #define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 /* * Tunefs sets this incompat flag before starting an operation which * would require cleanup on abort. This is done to protect users from * inadvertently mounting the fs after an aborted run without * fsck-ing. * * s_tunefs_flags on the super block describes precisely which * operations were in progress. */ #define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020 /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as * well as the name of the cluster being joined. * mount.ocfs2 must pass in a matching stack name. * * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 /* Support for the extended slot map */ #define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 /* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 /* Discontiguous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. */ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* * Append Direct IO support */ #define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 /* * backup superblock flag is used to indicate that this volume * has backup superblocks. */ #define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 /* * The filesystem will correctly handle journal feature bits. */ #define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002 /* * Unwritten extents support. */ #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 /* * Maintain quota information for this filesystem */ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. 
*/ #define OCFS2_BACKUP_SB_START 1 << 30 /* the max backup superblock nums */ #define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 /* * Flags on ocfs2_super_block.s_tunefs_flags */ #define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */ /* * Flags on ocfs2_dinode.i_flags */ #define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ #define OCFS2_UNUSED2_FL (0x00000002) #define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ #define OCFS2_UNUSED3_FL (0x00000008) /* System inode flags */ #define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ #define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ #define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ #define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ #define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ #define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features * * These can change much more often than i_flags. When adding flags, * keep in mind that i_dyn_features is only 16 bits wide. */ #define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */ #define OCFS2_HAS_XATTR_FL (0x0002) #define OCFS2_INLINE_XATTR_FL (0x0004) #define OCFS2_INDEXED_DIR_FL (0x0008) #define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ #define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ #define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ #define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ #define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ #define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ #define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ #define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ #define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ /* Reserved for compression usage... 
*/ #define OCFS2_DIRTY_FL FS_DIRTY_FL #define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ #define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ #define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ /* End compression flags --- maybe not all used */ #define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ #define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ #define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ #define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ #define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ #define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ #define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ #define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ #define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ #define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ /* * Extent record flags (e_node.leaf.flags) */ #define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but * unwritten */ #define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference * counted in an associated * refcount tree */ /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ /* * superblock s_state flags */ #define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ /* Limit of space in ocfs2_dir_entry */ #define OCFS2_MAX_FILENAME_LEN 255 /* Maximum slots on an ocfs2 file system */ #define OCFS2_MAX_SLOTS 255 /* Slot map indicator for an empty slot */ #define OCFS2_INVALID_SLOT ((u16)-1) #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 /* The cluster stack fields */ #define OCFS2_STACK_LABEL_LEN 4 #define OCFS2_CLUSTER_NAME_LEN 16 /* Classic (historically speaking) cluster stack */ #define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) /* * Inline extended attribute size (in bytes) * The value chosen should be aligned to 16 byte boundaries. 
*/ #define OCFS2_MIN_XATTR_INLINE_SIZE 256 /* * Cluster info flags (ocfs2_cluster_info.ci_stackflags) */ #define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) struct ocfs2_system_inode_info { char *si_name; int si_iflags; int si_mode; }; /* System file index */ enum { BAD_BLOCK_SYSTEM_INODE = 0, GLOBAL_INODE_ALLOC_SYSTEM_INODE, #define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE, HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE #define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, JOURNAL_SYSTEM_INODE, LOCAL_ALLOC_SYSTEM_INODE, TRUNCATE_LOG_SYSTEM_INODE, LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; #define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE #define NUM_LOCAL_SYSTEM_INODES \ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Global system inodes (single copy) */ /* The first two are only used from userspace mfks/tunefs */ [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, /* These are used by the running filesystem */ [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, /* Slot-specific system inodes (one copy per slot) */ [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, }; /* Parameter passed from mount.ocfs2 to module */ #define OCFS2_HB_NONE "heartbeat=none" #define OCFS2_HB_LOCAL "heartbeat=local" #define OCFS2_HB_GLOBAL "heartbeat=global" /* * OCFS2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define OCFS2_DIR_PAD 4 #define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) #define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ OCFS2_DIR_ROUND) & \ ~OCFS2_DIR_ROUND) #define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) #define OCFS2_LINK_MAX 32000 #define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) #define OCFS2_LINKS_HI_SHIFT 16 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) /* * Convenience casts */ #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) /* * Block checking structure. 
This is used in metadata to validate the * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all * zeros. */ struct ocfs2_block_check { /*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ __le16 bc_ecc; /* Single-error-correction parity vector. This is a simple Hamming code dependent on the blocksize. OCFS2's maximum blocksize, 4K, requires 16 parity bits, so we fit in __le16. */ __le16 bc_reserved1; /*08*/ }; /* * On disk extent record for OCFS2 * It describes a range of clusters on disk. * * Length fields are divided into interior and leaf node versions. * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. */ struct ocfs2_extent_rec { /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ union { __le32 e_int_clusters; /* Clusters covered by all children */ struct { __le16 e_leaf_clusters; /* Clusters covered by this extent */ __u8 e_reserved1; __u8 e_flags; /* Extent flags */ }; }; __le64 e_blkno; /* Physical disk offset, in blocks */ /*10*/ }; struct ocfs2_chain_rec { __le32 c_free; /* Number of free bits in this chain. */ __le32 c_total; /* Number of total bits in this chain */ __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ }; struct ocfs2_truncate_rec { __le32 t_start; /* 1st cluster in this log */ __le32 t_clusters; /* Number of total clusters covered */ }; /* * On disk extent list for OCFS2 (node in the tree). Note that this * is contained inside ocfs2_dinode or ocfs2_extent_block, so the * offsets are relative to ocfs2_dinode.id2.i_list or * ocfs2_extent_block.h_list, respectively. */ struct ocfs2_extent_list { /*00*/ __le16 l_tree_depth; /* Extent tree depth from this point. 0 means data extents hang directly off this header (a leaf) NOTE: The high 8 bits cannot be used - tree_depth is never that big. */ __le16 l_count; /* Number of extent records */ __le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ /* Extent records */ /*10*/ struct ocfs2_extent_rec l_recs[] __counted_by_le(l_count); }; /* * On disk allocation chain list for OCFS2. Note that this is * contained inside ocfs2_dinode, so the offsets are relative to * ocfs2_dinode.id2.i_chain. */ struct ocfs2_chain_list { /*00*/ __le16 cl_cpg; /* Clusters per Block Group */ __le16 cl_bpc; /* Bits per cluster */ __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; /* Chain records */ /*10*/ struct ocfs2_chain_rec cl_recs[] __counted_by_le(cl_count); }; /* * On disk deallocation log for OCFS2. Note that this is * contained inside ocfs2_dinode, so the offsets are relative to * ocfs2_dinode.id2.i_dealloc. */ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; /* Truncate records */ /*08*/ struct ocfs2_truncate_rec tl_recs[] __counted_by_le(tl_count); }; /* * On disk extent block (indirect block) for OCFS2 */ struct ocfs2_extent_block { /*00*/ __u8 h_signature[8]; /* Signature for verification */ struct ocfs2_block_check h_check; /* Error checking */ /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this extent_header belongs to */ __le16 h_suballoc_bit; /* Bit offset in suballocator block group */ __le32 h_fs_generation; /* Must match super block */ __le64 h_blkno; /* Offset on disk, in blocks */ /*20*/ __le64 h_suballoc_loc; /* Suballocator block group this eb belongs to. 
Only valid if allocated from a discontiguous block group */ __le64 h_next_leaf_blk; /* Offset on disk, in blocks, of next leaf header pointing to data */ /*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ /* Actual on-disk size is one block */ }; /* * On disk slot map for OCFS2. This defines the contents of the "slot_map" * system file. A slot is valid if it contains a node number >= 0. The * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { /*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. */ }; struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; /*08*/ }; /* * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP * is set. It separates out the valid marker from the node number, and * has room to grow. Unlike the old slot map, this format is defined by * i_size. */ struct ocfs2_slot_map_extended { /*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) */ }; /* * ci_stackflags is only valid if the incompat bit * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. */ struct ocfs2_cluster_info { /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; union { __le32 ci_reserved; struct { __u8 ci_stackflags; __u8 ci_reserved1; __u8 ci_reserved2; __u8 ci_reserved3; }; }; /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; /*18*/ }; /* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets * are relative to the start of ocfs2_dinode.id2. */ struct ocfs2_super_block { /*00*/ __le16 s_major_rev_level; __le16 s_minor_rev_level; __le16 s_mnt_count; __le16 s_max_mnt_count; __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le32 s_checkinterval; /* Max time between checks */ /*10*/ __le64 s_lastcheck; /* Time of last check */ __le32 s_creator_os; /* OS */ __le32 s_feature_compat; /* Compatible feature set */ /*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ __le64 s_root_blkno; /* Offset, in blocks, of root directory dinode */ /*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system directory dinode */ __le32 s_blocksize_bits; /* Blocksize for this fs */ __le32 s_clustersize_bits; /* Clustersize for this fs */ /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either userspace or clusterinfo INCOMPAT flag set. */ /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ /*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* * NOTE: As stated above, all offsets are relative to * ocfs2_dinode.id2, which is at 0xC0 in the inode. * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within * our smallest blocksize, which is 512 bytes. 
To ensure this, * we reserve the space in s_reserved2. Anything past s_reserved2 * will not be available on the smallest blocksize. */ }; /* * Local allocation bitmap for OCFS2 slots * Note that it exists inside an ocfs2_dinode, so all offsets are * relative to the start of ocfs2_dinode.id2. */ struct ocfs2_local_alloc { /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; /*10*/ __u8 la_bitmap[]; }; /* * Data-in-inode header. This is only used if i_dyn_features has * OCFS2_INLINE_DATA_FL set. */ struct ocfs2_inline_data { /*00*/ __le16 id_count; /* Number of bytes that can be used * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; __u8 id_data[]; /* Start of user data */ }; /* * On disk inode for OCFS2 */ struct ocfs2_dinode { /*00*/ __u8 i_signature[8]; /* Signature for validation */ __le32 i_generation; /* Generation number */ __le16 i_suballoc_slot; /* Slot suballocator this inode belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ /*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ /*20*/ __le64 i_size; /* Size in bytes */ __le16 i_mode; /* File mode */ __le16 i_links_count; /* Links count */ __le32 i_flags; /* File flags */ /*30*/ __le64 i_atime; /* Access time */ __le64 i_ctime; /* Creation time */ /*40*/ __le64 i_mtime; /* Modification time */ __le64 i_dtime; /* Deletion time */ /*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ __le64 i_last_eb_blk; /* Pointer to last extent block */ /*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; /*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ /*90*/ __le64 i_refcount_loc; __le64 i_suballoc_loc; /* Suballocator block group this inode belongs to. Only valid if allocated from a discontiguous block group */ /*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ __le16 i_reserved1[3]; __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ struct { __le64 i_rdev; /* Device number */ } dev1; struct { /* Info for bitmap system inodes */ __le32 i_used; /* Bits (ie, clusters) used */ __le32 i_total; /* Total bits (clusters) available */ } bitmap1; struct { /* Info for journal system inodes */ __le32 ij_flags; /* Mounted, version, etc. 
*/ __le32 ij_recovery_generation; /* Incremented when the journal is recovered after an unclean shutdown */ } journal1; } id1; /* Inode type dependent 1 */ /*C0*/ union { struct ocfs2_super_block i_super; struct ocfs2_local_alloc i_lab; struct ocfs2_chain_list i_chain; struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; /* * On-disk directory entry structure for OCFS2 * * Packed as this structure could be accessed unaligned on 64-bit platforms */ struct ocfs2_dir_entry { /*00*/ __le64 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ /* Actual on-disk length specified by rec_len */ } __attribute__ ((packed)); /* * Per-block record for the unindexed directory btree. This is carefully * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are * mirrored. That way, the directory manipulation code needs a minimal amount * of update. * * NOTE: Keep this structure aligned to a multiple of 4 bytes. */ struct ocfs2_dir_block_trailer { /*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ __le16 db_compat_rec_len; /* Backwards compatible with * ocfs2_dir_entry. */ __u8 db_compat_name_len; /* Always zero. Was name_len */ __u8 db_reserved0; __le16 db_reserved1; __le16 db_free_rec_len; /* Size of largest empty hole * in this block. (unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; /*20*/ __le64 db_free_next; /* Next block in list (unused) */ __le64 db_blkno; /* Offset on disk, in blocks */ /*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; /* * A directory entry in the indexed tree. We don't store the full name here, * but instead provide a pointer to the full dirent in the unindexed tree. * * We also store name_len here so as to reduce the number of leaf blocks we * need to search in case of collisions. */ struct ocfs2_dx_entry { __le32 dx_major_hash; /* Used to find logical * cluster in index */ __le32 dx_minor_hash; /* Lower bits used to find * block in cluster */ __le64 dx_dirent_blk; /* Physical block in unindexed * tree holding this dirent. */ }; struct ocfs2_dx_entry_list { __le32 de_reserved; __le16 de_count; /* Maximum number of entries * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ /* Indexed dir entries in a packed * array of length de_num_used. */ struct ocfs2_dx_entry de_entries[] __counted_by_le(de_count); }; #define OCFS2_DX_FLAG_INLINE 0x01 /* * A directory indexing block. Each indexed directory has one of these, * pointed to by ocfs2_dinode. * * This block stores an indexed btree root, and a set of free space * start-of-list pointers. */ struct ocfs2_dx_root_block { __u8 dr_signature[8]; /* Signature for verification */ struct ocfs2_block_check dr_check; /* Error checking */ __le16 dr_suballoc_slot; /* Slot suballocator this * block belongs to. */ __le16 dr_suballoc_bit; /* Bit offset in suballocator * block group */ __le32 dr_fs_generation; /* Must match super block */ __le64 dr_blkno; /* Offset on disk, in blocks */ __le64 dr_last_eb_blk; /* Pointer to last * extent block */ __le32 dr_clusters; /* Clusters allocated * to the indexed tree. 
*/ __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ __u8 dr_reserved0; __le16 dr_reserved1; __le64 dr_dir_blkno; /* Pointer to parent inode */ __le32 dr_num_entries; /* Total number of * names stored in * this directory.*/ __le32 dr_reserved2; __le64 dr_free_blk; /* Pointer to head of free * unindexed block list. */ __le64 dr_suballoc_loc; /* Suballocator block group this root belongs to. Only valid if allocated from a discontiguous block group */ __le64 dr_reserved3[14]; union { struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 * bits for maximum space * efficiency. */ struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of * entries. We grow out * to extents if this * gets too big. */ }; }; /* * The header of a leaf block in the indexed tree. */ struct ocfs2_dx_leaf { __u8 dl_signature[8];/* Signature for verification */ struct ocfs2_block_check dl_check; /* Error checking */ __le64 dl_blkno; /* Offset on disk, in blocks */ __le32 dl_fs_generation;/* Must match super block */ __le32 dl_reserved0; __le64 dl_reserved1; struct ocfs2_dx_entry_list dl_list; }; /* * Largest bitmap for a block (suballocator) group in bytes. This limit * does not affect cluster groups (global allocator). Cluster group * bitmaps run to the end of the block. */ #define OCFS2_MAX_BG_BITMAP_SIZE 256 /* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc { /*00*/ __u8 bg_signature[8]; /* Signature for validation */ __le16 bg_size; /* Size of included bitmap in bytes. */ __le16 bg_bits; /* Bits represented by this group. */ __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. */ /*10*/ __le32 bg_generation; __le16 bg_contig_free_bits; /* max contig free bits length */ __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in blocks */ __le64 bg_blkno; /* Offset on disk, in blocks */ /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. * The extents of a discontiguous block group are * stored in bg_list. It is a flat list. * l_tree_depth must always be zero. A * discontiguous group is signified by a non-zero * bg_list->l_next_free_rec. Only block groups * can be discontiguous; Cluster groups cannot. * We've never made a block group with more than * 2048 blocks (256 bytes of bg_bitmap). This * codifies that limit so that we can fit bg_list. * bg_size of a discontiguous block group will * be 256 to match bg_bitmap_filler. 
*/ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; /*140*/ struct ocfs2_extent_list bg_list; }; }; /* Actual on-disk size is one block */ }; struct ocfs2_refcount_rec { /*00*/ __le64 r_cpos; /* Physical offset, in clusters */ __le32 r_clusters; /* Clusters covered by this extent */ __le32 r_refcount; /* Reference count of this extent */ /*10*/ }; #define OCFS2_32BIT_POS_MASK (0xffffffffULL) #define OCFS2_REFCOUNT_LEAF_FL (0x00000001) #define OCFS2_REFCOUNT_TREE_FL (0x00000002) struct ocfs2_refcount_list { /*00*/ __le16 rl_count; /* Maximum number of entries possible in rl_records */ __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ /* Refcount records */ /*10*/ struct ocfs2_refcount_rec rl_recs[] __counted_by_le(rl_count); }; struct ocfs2_refcount_block { /*00*/ __u8 rf_signature[8]; /* Signature for verification */ __le16 rf_suballoc_slot; /* Slot suballocator this block belongs to */ __le16 rf_suballoc_bit; /* Bit offset in suballocator block group */ __le32 rf_fs_generation; /* Must match superblock */ /*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ __le64 rf_parent; /* Parent block, only valid if OCFS2_REFCOUNT_LEAF_FL is set in rf_flags */ /*20*/ struct ocfs2_block_check rf_check; /* Error checking */ __le64 rf_last_eb_blk; /* Pointer to last extent block */ /*30*/ __le32 rf_count; /* Number of inodes sharing this refcount tree */ __le32 rf_flags; /* See the flags above */ __le32 rf_clusters; /* clusters covered by refcount tree. */ __le32 rf_cpos; /* cluster offset in refcount tree.*/ /*40*/ __le32 rf_generation; /* generation number. all be the same * for the same refcount tree. */ __le32 rf_reserved0; __le64 rf_suballoc_loc; /* Suballocator block group this refcount block belongs to. Only valid if allocated from a discontiguous block group */ /*50*/ __le64 rf_reserved1[6]; /*80*/ union { struct ocfs2_refcount_list rf_records; /* List of refcount records */ struct ocfs2_extent_list rf_list; /* Extent record list, only valid if OCFS2_REFCOUNT_TREE_FL is set in rf_flags */ }; /* Actual on-disk size is one block */ }; /* * On disk extended attribute structure for OCFS2. */ /* * ocfs2_xattr_entry indicates one extend attribute. * * Note that it can be stored in inode, one block or one xattr bucket. */ struct ocfs2_xattr_entry { __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ __le16 xe_name_offset; /* byte offset from the 1st entry in the local xattr storage(inode, xattr block or xattr bucket). */ __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ __u8 xe_type; /* the low 7 bits indicate the name prefix * type and the highest bit indicates whether * the EA is stored in the local storage. */ __le64 xe_value_size; /* real xattr value length. */ }; /* * On disk structure for xattr header. * * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in * the local xattr storage. */ struct ocfs2_xattr_header { __le16 xh_count; /* contains the count of how many records are in the local xattr storage. */ __le16 xh_free_start; /* current offset for storing xattr. */ __le16 xh_name_value_len; /* total length of name/value length in this bucket. */ __le16 xh_num_buckets; /* Number of xattr buckets in this extent record, only valid in the first bucket. */ struct ocfs2_block_check xh_check; /* Error checking (Note, this is only used for xattr buckets. A block uses xb_check and sets this field to zero.) */ /* xattr entry list. 
*/ struct ocfs2_xattr_entry xh_entries[] __counted_by_le(xh_count); }; /* * On disk structure for xattr value root. * * When an xattr's value is large enough, it is stored in an external * b-tree like file data. The xattr value root points to this structure. */ struct ocfs2_xattr_value_root { /*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ __le32 xr_reserved0; __le64 xr_last_eb_blk; /* Pointer to last extent block */ /*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ }; /* * On disk structure for xattr tree root. * * It is used when there are too many extended attributes for one file. These * attributes will be organized and stored in an indexed-btree. */ struct ocfs2_xattr_tree_root { /*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ __le32 xt_reserved0; __le64 xt_last_eb_blk; /* Pointer to last extent block */ /*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ }; #define OCFS2_XATTR_INDEXED 0x1 #define OCFS2_HASH_SHIFT 5 #define OCFS2_XATTR_ROUND 3 #define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ ~(OCFS2_XATTR_ROUND)) #define OCFS2_XATTR_BUCKET_SIZE 4096 #define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ / OCFS2_MIN_BLOCKSIZE) /* * On disk structure for xattr block. */ struct ocfs2_xattr_block { /*00*/ __u8 xb_signature[8]; /* Signature for verification */ __le16 xb_suballoc_slot; /* Slot suballocator this block belongs to. */ __le16 xb_suballoc_bit; /* Bit offset in suballocator block group */ __le32 xb_fs_generation; /* Must match super block */ /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ struct ocfs2_block_check xb_check; /* Error checking */ /*20*/ __le16 xb_flags; /* Indicates whether this block contains real xattr or a xattr tree. */ __le16 xb_reserved0; __le32 xb_reserved1; __le64 xb_suballoc_loc; /* Suballocator block group this xattr block belongs to. Only valid if allocated from a discontiguous block group */ /*30*/ union { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this block contains xattr tree. */ } xb_attrs; }; #define OCFS2_XATTR_ENTRY_LOCAL 0x80 #define OCFS2_XATTR_TYPE_MASK 0x7F static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, int local) { if (local) xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; else xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; } static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) { return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; } static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) { xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; } static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) { return xe->xe_type & OCFS2_XATTR_TYPE_MASK; } /* * On disk structures for global quota file */ /* Magic numbers and known versions for global quota files */ #define OCFS2_GLOBAL_QMAGICS {\ 0x0cf52470, /* USRQUOTA */ \ 0x0cf52471 /* GRPQUOTA */ \ } #define OCFS2_GLOBAL_QVERSIONS {\ 0, \ 0, \ } /* Each block of each quota file has a certain fixed number of bytes reserved * for OCFS2 internal use at its end. OCFS2 can use it for things like * checksums, etc. 
*/ #define OCFS2_QBLK_RESERVED_SPACE 8 /* Generic header of all quota files */ struct ocfs2_disk_dqheader { __le32 dqh_magic; /* Magic number identifying file */ __le32 dqh_version; /* Quota format version */ }; #define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) /* Information header of global quota file (immediately follows the generic * header) */ struct ocfs2_global_disk_dqinfo { /*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ __le32 dqi_igrace; /* Grace time for inode softlimit excess */ __le32 dqi_syncms; /* Time after which we sync local changes to * global quota file */ __le32 dqi_blocks; /* Number of blocks in quota file */ /*10*/ __le32 dqi_free_blk; /* First free block in quota file */ __le32 dqi_free_entry; /* First block with free dquot entry in quota * file */ }; /* Structure with global user / group information. We reserve some space * for future use. */ struct ocfs2_global_disk_dqblk { /*00*/ __le32 dqb_id; /* ID the structure belongs to */ __le32 dqb_use_count; /* Number of nodes having reference to this structure */ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ /*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ __le64 dqb_curinodes; /* current # allocated inodes */ /*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ /*30*/ __le64 dqb_curspace; /* current space occupied */ __le64 dqb_btime; /* time limit for excessive disk use */ /*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ __le64 dqb_pad1; /*50*/ __le64 dqb_pad2; }; /* * On-disk structures for local quota file */ /* Magic numbers and known versions for local quota files */ #define OCFS2_LOCAL_QMAGICS {\ 0x0cf524c0, /* USRQUOTA */ \ 0x0cf524c1 /* GRPQUOTA */ \ } #define OCFS2_LOCAL_QVERSIONS {\ 0, \ 0, \ } /* Quota flags in dqinfo header */ #define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ * quota has been cleanly turned off) */ #define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) /* Information header of local quota file (immediately follows the generic * header) */ struct ocfs2_local_disk_dqinfo { __le32 dqi_flags; /* Flags for quota file */ __le32 dqi_chunks; /* Number of chunks of quota structures * with a bitmap */ __le32 dqi_blocks; /* Number of blocks allocated for quota file */ }; /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; /* One entry in local quota file */ struct ocfs2_local_disk_dqblk { /*00*/ __le64 dqb_id; /* id this quota applies to */ __le64 dqb_spacemod; /* Change in the amount of used space */ /*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ }; /* * The quota trailer lives at the end of each quota block. 
*/ struct ocfs2_disk_dqtrailer { /*00*/ struct ocfs2_block_check dq_check; /* Error checking */ /*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ }; static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, void *buf) { char *ptr = buf; ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; return (struct ocfs2_disk_dqtrailer *)ptr; } #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data) - xattrsize; else return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_inode_with_xattr( struct super_block *sb, struct ocfs2_dinode *di) { int size; unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - xattrsize; else size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); return size / sizeof(struct ocfs2_chain_rec); } static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_extent_block, h_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); return size / sizeof(struct ocfs2_dx_entry); } static inline int ocfs2_dx_entries_per_root(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); return size / sizeof(struct ocfs2_dx_entry); } static inline u16 ocfs2_local_alloc_size(struct super_block *sb) { u16 size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); return size; } static inline int ocfs2_group_bitmap_size(struct super_block *sb, int suballocator, u32 feature_incompat) { int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); /* * The cluster allocator uses the entire block. Suballocators have * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older * code expects bg_size set to the maximum. Thus we must keep * bg_size as-is unless discontig_bg is enabled. 
*/ if (suballocator && (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) size = OCFS2_MAX_BG_BITMAP_SIZE; return size; } static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); return size / sizeof(struct ocfs2_truncate_rec); } static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) { u64 offset = OCFS2_BACKUP_SB_START; if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { offset <<= (2 * index); offset >>= sb->s_blocksize_bits; return offset; } return 0; } static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_xattr_block, xb_attrs.xb_root.xt_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); return size / sizeof(struct ocfs2_refcount_rec); } static inline u32 ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) { return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; } #else static inline int ocfs2_fast_symlink_chars(int blocksize) { return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } static inline int ocfs2_max_inline_data_with_xattr(int blocksize, struct ocfs2_dinode *di) { if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data) - di->i_xattr_inline_size; else return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_chain_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); return size / sizeof(struct ocfs2_chain_rec); } static inline int ocfs2_extent_recs_per_eb(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_extent_block, h_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_gd(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_group_desc, bg_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_local_alloc_size(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); return size; } static inline int ocfs2_group_bitmap_size(int blocksize, int suballocator, uint32_t feature_incompat) { int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); /* * The cluster allocator uses the entire block. Suballocators have * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older * code expects bg_size set to the maximum. Thus we must keep * bg_size as-is unless discontig_bg is enabled. 
*/ if (suballocator && (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) size = OCFS2_MAX_BG_BITMAP_SIZE; return size; } static inline int ocfs2_truncate_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); return size / sizeof(struct ocfs2_truncate_rec); } static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) { uint64_t offset = OCFS2_BACKUP_SB_START; if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { offset <<= (2 * index); offset /= blocksize; return offset; } return 0; } static inline int ocfs2_xattr_recs_per_xb(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_xattr_block, xb_attrs.xb_root.xt_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } #endif /* __KERNEL__ */ static inline int ocfs2_system_inode_is_global(int type) { return ((type >= 0) && (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); } static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, int type, int slot) { int chars; /* * Global system inodes can only have one copy. Everything * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode * list has a copy per slot. */ if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) chars = snprintf(buf, len, "%s", ocfs2_system_inodes[type].si_name); else chars = snprintf(buf, len, ocfs2_system_inodes[type].si_name, slot); return chars; } static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, umode_t mode) { de->file_type = fs_umode_to_ftype(mode); } static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) { if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + le16_to_cpu(gd->bg_size)) != offsetof(struct ocfs2_group_desc, bg_list)) return 0; /* * Only valid to check l_next_free_rec if * bg_bitmap + bg_size == bg_list. */ if (!gd->bg_list.l_next_free_rec) return 0; return 1; } #endif /* _OCFS2_FS_H */ |
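/*
 * Illustrative sketch, not part of ocfs2_fs.h: how the "records per block"
 * helpers from the !__KERNEL__ branch above are typically consumed by
 * userspace tooling.  The program, the 4096-byte blocksize, and the assumption
 * that the header is available to a userspace build (as ocfs2-tools does with
 * its own copy) are examples only.  Each helper subtracts the offset of the
 * embedded structure from the blocksize and, for record arrays, divides by
 * the record size.
 */
#include <stdio.h>
/* #include "ocfs2_fs.h" -- assumed available to the userspace build */

static void ocfs2_print_geometry(int blocksize)
{
        printf("extent records per inode:        %d\n",
               ocfs2_extent_recs_per_inode(blocksize));
        printf("chain records per inode:         %d\n",
               ocfs2_chain_recs_per_inode(blocksize));
        printf("extent records per extent block: %d\n",
               ocfs2_extent_recs_per_eb(blocksize));
        printf("fast symlink characters:         %d\n",
               ocfs2_fast_symlink_chars(blocksize));
}

int main(void)
{
        ocfs2_print_geometry(4096);     /* example blocksize */
        return 0;
}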
| 29 29 29 3 3 3 3 3 3 3 3 3 3 58 55 11 23 18 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 | // SPDX-License-Identifier: GPL-2.0-only /* * This contains encryption functions for per-file encryption. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * * Written by Michael Halcrow, 2014. * * Filename encryption additions * Uday Savagaonkar, 2014 * Encryption policy handling additions * Ildar Muslukhov, 2014 * Add fscrypt_pullback_bio_page() * Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. * * The usage of AES-XTS should conform to recommendations in NIST * Special Publication 800-38E and IEEE P1619/D16. */ #include <crypto/skcipher.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/scatterlist.h> #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; module_param(num_prealloc_crypto_pages, uint, 0444); MODULE_PARM_DESC(num_prealloc_crypto_pages, "Number of crypto pages to preallocate"); static mempool_t *fscrypt_bounce_page_pool = NULL; static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); struct kmem_cache *fscrypt_inode_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { queue_work(fscrypt_read_workqueue, work); } EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) { if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) { /* * Oops, the filesystem called a function that uses the bounce * page pool, but it didn't set needs_bounce_pages. */ return NULL; } return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); } /** * fscrypt_free_bounce_page() - free a ciphertext bounce page * @bounce_page: the bounce page to free, or NULL * * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(), * or by fscrypt_alloc_bounce_page() directly. 
*/ void fscrypt_free_bounce_page(struct page *bounce_page) { if (!bounce_page) return; set_page_private(bounce_page, (unsigned long)NULL); ClearPagePrivate(bounce_page); mempool_free(bounce_page, fscrypt_bounce_page_pool); } EXPORT_SYMBOL(fscrypt_free_bounce_page); /* * Generate the IV for the given data unit index within the given file. * For filenames encryption, index == 0. * * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() * needs to know about any IV generation methods where the low bits of IV don't * simply contain the data unit index (e.g., IV_INO_LBLK_32). */ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, const struct fscrypt_inode_info *ci) { u8 flags = fscrypt_policy_flags(&ci->ci_policy); memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { WARN_ON_ONCE(index > U32_MAX); WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); index |= (u64)ci->ci_inode->i_ino << 32; } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { WARN_ON_ONCE(index > U32_MAX); index = (u32)(ci->ci_hashed_ino + index); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE); } iv->index = cpu_to_le64(index); } /* Encrypt or decrypt a single "data unit" of file contents. */ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs) { struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; struct scatterlist dst, src; int err; if (WARN_ON_ONCE(len <= 0)) return -EINVAL; if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0)) return -EINVAL; fscrypt_generate_iv(&iv, index, ci); skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); sg_set_page(&src, src_page, len, offs); skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) err = crypto_skcipher_decrypt(req); else err = crypto_skcipher_encrypt(req); if (err) fscrypt_err(ci->ci_inode, "%scryption failed for data unit %llu: %d", (rw == FS_DECRYPT ? "De" : "En"), index, err); return err; } /** * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio * @folio: the locked pagecache folio containing the data to encrypt * @len: size of the data to encrypt, in bytes * @offs: offset within @page of the data to encrypt, in bytes * @gfp_flags: memory allocation flags; see details below * * This allocates a new bounce page and encrypts the given data into it. The * length and offset of the data must be aligned to the file's crypto data unit * size. Alignment to the filesystem block size fulfills this requirement, as * the filesystem block size is always a multiple of the data unit size. * * In the bounce page, the ciphertext data will be located at the same offset at * which the plaintext data was located in the source page. Any other parts of * the bounce page will be left uninitialized. * * This is for use by the filesystem's ->writepages() method. * * The bounce page allocation is mempool-backed, so it will always succeed when * @gfp_flags includes __GFP_DIRECT_RECLAIM, e.g. when it's GFP_NOFS. However, * only the first page of each bio can be allocated this way. To prevent * deadlocks, for any additional pages a mask like GFP_NOWAIT must be used. 
* * Return: the new encrypted bounce page on success; an ERR_PTR() on failure */ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + (offs >> du_bits); unsigned int i; int err; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); if (WARN_ON_ONCE(!folio_test_locked(folio))) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) return ERR_PTR(-EINVAL); ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags); if (!ciphertext_page) return ERR_PTR(-ENOMEM); for (i = offs; i < offs + len; i += du_size, index++) { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, &folio->page, ciphertext_page, du_size, i); if (err) { fscrypt_free_bounce_page(ciphertext_page); return ERR_PTR(err); } } SetPagePrivate(ciphertext_page); set_page_private(ciphertext_page, (unsigned long)folio); return ciphertext_page; } EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); /** * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to encrypt * @len: Size of block to encrypt. This must be a multiple of * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to encrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file * * Encrypt a possibly-compressed filesystem block that is located in an * arbitrary page, not necessarily in the original pagecache page. The @inode * and @lblk_num must be specified, as they can't be determined from @page. * * This is not compatible with fscrypt_operations::supports_subblock_data_units. * * Return: 0 on success; -errno on failure */ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), FS_ENCRYPT, lblk_num, page, page, len, offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio * @folio: the pagecache folio containing the data to decrypt * @len: size of the data to decrypt, in bytes * @offs: offset within @folio of the data to decrypt, in bytes * * Decrypt data that has just been read from an encrypted file. The data must * be located in a pagecache folio that is still locked and not yet uptodate. * The length and offset of the data must be aligned to the file's crypto data * unit size. Alignment to the filesystem block size fulfills this requirement, * as the filesystem block size is always a multiple of the data unit size. 
* * Return: 0 on success; -errno on failure */ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + (offs >> du_bits); size_t i; int err; if (WARN_ON_ONCE(!folio_test_locked(folio))) return -EINVAL; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) return -EINVAL; for (i = offs; i < offs + len; i += du_size, index++) { struct page *page = folio_page(folio, i >> PAGE_SHIFT); err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page, page, du_size, i & ~PAGE_MASK); if (err) return err; } return 0; } EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks); /** * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to decrypt * @len: Size of block to decrypt. This must be a multiple of * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to decrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file * * Decrypt a possibly-compressed filesystem block that is located in an * arbitrary page, not necessarily in the original pagecache page. The @inode * and @lblk_num must be specified, as they can't be determined from @page. * * This is not compatible with fscrypt_operations::supports_subblock_data_units. * * Return: 0 on success; -errno on failure */ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), FS_DECRYPT, lblk_num, page, page, len, offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); /** * fscrypt_initialize() - allocate major buffers for fs encryption. * @sb: the filesystem superblock * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: 0 on success; -errno on failure */ int fscrypt_initialize(struct super_block *sb) { int err = 0; mempool_t *pool; /* pairs with smp_store_release() below */ if (likely(smp_load_acquire(&fscrypt_bounce_page_pool))) return 0; /* No need to allocate a bounce page pool if this FS won't use it. */ if (!sb->s_cop->needs_bounce_pages) return 0; mutex_lock(&fscrypt_init_mutex); if (fscrypt_bounce_page_pool) goto out_unlock; err = -ENOMEM; pool = mempool_create_page_pool(num_prealloc_crypto_pages, 0); if (!pool) goto out_unlock; /* pairs with smp_load_acquire() above */ smp_store_release(&fscrypt_bounce_page_pool, pool); err = 0; out_unlock: mutex_unlock(&fscrypt_init_mutex); return err; } void fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...) 
{ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct va_format vaf; va_list args; if (!__ratelimit(&rs)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (inode && inode->i_ino) printk("%sfscrypt (%s, inode %lu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else if (inode) printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf); else printk("%sfscrypt: %pV\n", level, &vaf); va_end(args); } /** * fscrypt_init() - Set up for fs encryption. * * Return: 0 on success; -errno on failure */ static int __init fscrypt_init(void) { int err = -ENOMEM; /* * Use an unbound workqueue to allow bios to be decrypted in parallel * even when they happen to complete on the same CPU. This sacrifices * locality, but it's worthwhile since decryption is CPU-intensive. * * Also use a high-priority workqueue to prioritize decryption work, * which blocks reads from completing, over regular application tasks. */ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); if (!fscrypt_read_workqueue) goto fail; fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info, SLAB_RECLAIM_ACCOUNT); if (!fscrypt_inode_info_cachep) goto fail_free_queue; err = fscrypt_init_keyring(); if (err) goto fail_free_inode_info; return 0; fail_free_inode_info: kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: return err; } late_initcall(fscrypt_init) |
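/*
 * Hedged sketch, not part of this file: roughly how a filesystem's
 * ->writepages() path consumes the bounce-page helpers above.  The function,
 * its bio handling, and the error policy are hypothetical; only the fscrypt_*
 * calls and their documented constraints come from the code above.
 */
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/fscrypt.h>
#include <linux/mm.h>

static int example_encrypt_and_queue(struct folio *folio, struct bio *bio)
{
        size_t len = folio_size(folio);
        struct page *bounce_page;

        /*
         * GFP_NOFS includes __GFP_DIRECT_RECLAIM, so this mempool-backed
         * allocation cannot fail for the first page added to the bio.
         */
        bounce_page = fscrypt_encrypt_pagecache_blocks(folio, len, 0, GFP_NOFS);
        if (IS_ERR(bounce_page))
                return PTR_ERR(bounce_page);

        /*
         * The bio carries ciphertext while the pagecache folio keeps the
         * plaintext.  The bounce page is released with
         * fscrypt_free_bounce_page() once the write completes.
         */
        if (!bio_add_page(bio, bounce_page, len, 0)) {
                fscrypt_free_bounce_page(bounce_page);
                return -EIO;
        }
        return 0;
}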
| 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef _CRYPTO_BLAKE2B_H #define _CRYPTO_BLAKE2B_H #include <linux/bug.h> #include <linux/types.h> #include <linux/string.h> enum blake2b_lengths { BLAKE2B_BLOCK_SIZE = 128, BLAKE2B_HASH_SIZE = 64, BLAKE2B_KEY_SIZE = 64, BLAKE2B_160_HASH_SIZE = 20, BLAKE2B_256_HASH_SIZE = 32, BLAKE2B_384_HASH_SIZE = 48, BLAKE2B_512_HASH_SIZE = 64, }; /** * struct blake2b_ctx - Context for hashing a message with BLAKE2b * @h: compression function state * @t: block counter * @f: finalization indicator * @buf: partial block buffer; 'buflen' bytes are valid * @buflen: number of bytes buffered in @buf * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE */ struct blake2b_ctx { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ u64 h[8]; u64 t[2]; u64 f[2]; u8 buf[BLAKE2B_BLOCK_SIZE]; unsigned int buflen; unsigned int outlen; }; enum blake2b_iv { BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL, BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL, BLAKE2B_IV2 = 0x3C6EF372FE94F82BULL, BLAKE2B_IV3 = 0xA54FF53A5F1D36F1ULL, BLAKE2B_IV4 = 0x510E527FADE682D1ULL, BLAKE2B_IV5 = 0x9B05688C2B3E6C1FULL, BLAKE2B_IV6 = 0x1F83D9ABFB41BD6BULL, BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL, }; static inline void __blake2b_init(struct blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen) { ctx->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); ctx->h[1] = BLAKE2B_IV1; ctx->h[2] = BLAKE2B_IV2; ctx->h[3] = BLAKE2B_IV3; ctx->h[4] = BLAKE2B_IV4; ctx->h[5] = BLAKE2B_IV5; ctx->h[6] = BLAKE2B_IV6; ctx->h[7] = BLAKE2B_IV7; ctx->t[0] = 0; ctx->t[1] = 0; ctx->f[0] = 0; ctx->f[1] = 0; ctx->buflen = 0; ctx->outlen = outlen; if (keylen) { memcpy(ctx->buf, key, keylen); memset(&ctx->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen); ctx->buflen = BLAKE2B_BLOCK_SIZE; } } /** * blake2b_init() - Initialize a BLAKE2b context for a new message (unkeyed) * @ctx: the context to initialize * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE * * Context: Any context. */ static inline void blake2b_init(struct blake2b_ctx *ctx, size_t outlen) { __blake2b_init(ctx, outlen, NULL, 0); } /** * blake2b_init_key() - Initialize a BLAKE2b context for a new message (keyed) * @ctx: the context to initialize * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE * @key: the key * @keylen: the key length in bytes, at most BLAKE2B_KEY_SIZE * * Context: Any context. */ static inline void blake2b_init_key(struct blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen) { WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2B_HASH_SIZE || !key || !keylen || keylen > BLAKE2B_KEY_SIZE)); __blake2b_init(ctx, outlen, key, keylen); } /** * blake2b_update() - Update a BLAKE2b context with message data * @ctx: the context to update; must have been initialized * @in: the message data * @inlen: the data length in bytes * * This can be called any number of times. * * Context: Any context. 
 */
void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen);

/**
 * blake2b_final() - Finish computing a BLAKE2b hash
 * @ctx: the context to finalize; must have been initialized
 * @out: (output) the resulting BLAKE2b hash.  Its length will be equal to the
 *       @outlen that was passed to blake2b_init() or blake2b_init_key().
 *
 * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
 *
 * Context: Any context.
 */
void blake2b_final(struct blake2b_ctx *ctx, u8 *out);

/**
 * blake2b() - Compute BLAKE2b hash in one shot
 * @key: the key, or NULL for an unkeyed hash
 * @keylen: the key length in bytes (at most BLAKE2B_KEY_SIZE), or 0 for an
 *          unkeyed hash
 * @in: the message data
 * @inlen: the data length in bytes
 * @out: (output) the resulting BLAKE2b hash, with length @outlen
 * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
 *
 * Context: Any context.
 */
static inline void blake2b(const u8 *key, size_t keylen,
                           const u8 *in, size_t inlen,
                           u8 *out, size_t outlen)
{
        struct blake2b_ctx ctx;

        WARN_ON(IS_ENABLED(DEBUG) &&
                ((!in && inlen > 0) || !out || !outlen ||
                 outlen > BLAKE2B_HASH_SIZE || keylen > BLAKE2B_KEY_SIZE ||
                 (!key && keylen)));

        __blake2b_init(&ctx, outlen, key, keylen);
        blake2b_update(&ctx, in, inlen);
        blake2b_final(&ctx, out);
}

#endif /* _CRYPTO_BLAKE2B_H */
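/*
 * Hedged usage sketch, not part of this header: computing an unkeyed 256-bit
 * BLAKE2b digest with the one-shot helper and, equivalently, with the
 * incremental init/update/final API declared above.  The function name and
 * message are examples only.
 */
#include <crypto/blake2b.h>

static void blake2b_demo(const u8 *msg, size_t msglen,
                         u8 digest[BLAKE2B_256_HASH_SIZE])
{
        struct blake2b_ctx ctx;

        /* One-shot: key == NULL and keylen == 0 select an unkeyed hash. */
        blake2b(NULL, 0, msg, msglen, digest, BLAKE2B_256_HASH_SIZE);

        /* Incremental equivalent; update() may be called any number of times. */
        blake2b_init(&ctx, BLAKE2B_256_HASH_SIZE);
        blake2b_update(&ctx, msg, msglen);
        blake2b_final(&ctx, digest);    /* also zeroizes ctx */
}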
| 260 230 230 230 6 224 230 155 169 169 169 169 74 74 74 74 147 146 147 129 147 147 146 80 80 22 4 80 80 22 73 27 3 415 416 416 414 38 38 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/list_bl.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/mbcache.h> /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in * mb_cache_entry_delete_or_get()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * Ext4 also uses it for deduplication of xattr values stored in inodes. * They use hash of data as a key and provide a value that may represent a * block or inode number. That's why keys need not be unique (hash of different * data may be the same). However user provided value always uniquely * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. Fixed * size hash table is used for fast key lookups. 
*/ struct mb_cache { /* Hash table of entries */ struct hlist_bl_head *c_hash; /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) { return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } /* * Number of entries to reclaim synchronously when there are too many entries * in cache */ #define SYNC_SHRINK_BATCH 64 /* * mb_cache_entry_create - create entry in cache * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry * @value - value of the entry * @reusable - is the entry reusable by others? * * Creates entry in @cache with key @key and value @value. The function returns * -EBUSY if entry with the same key and value already exists in cache. * Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; struct hlist_bl_head *head; /* Schedule background reclaim if there are too many entries */ if (cache->c_entry_count >= cache->c_max_entries) schedule_work(&cache->c_shrink_work); /* Do some sync reclaim if background reclaim cannot keep up */ if (cache->c_entry_count >= 2*cache->c_max_entries) mb_cache_shrink(cache, SYNC_SHRINK_BATCH); entry = kmem_cache_alloc(mb_entry_cache, mask); if (!entry) return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); /* * We create entry with two references. One reference is kept by the * hash table, the other reference is used to protect us from * mb_cache_entry_delete_or_get() until the entry is fully setup. This * avoids nesting of cache->c_list_lock into hash table bit locks which * is problematic for RT. */ atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; entry->e_flags = 0; if (reusable) set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; } } hlist_bl_add_head(&entry->e_hash_list, head); hlist_bl_unlock(head); spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); mb_cache_entry_put(cache, entry); return 0; } EXPORT_SYMBOL(mb_cache_entry_create); void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry) { struct hlist_bl_head *head; head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); hlist_bl_del(&entry->e_hash_list); hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); } EXPORT_SYMBOL(__mb_cache_entry_free); /* * mb_cache_entry_wait_unused - wait to be the last user of the entry * * @entry - entry to work on * * Wait to be the last user of the entry. 
*/ void mb_cache_entry_wait_unused(struct mb_cache_entry *entry) { wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2); } EXPORT_SYMBOL(mb_cache_entry_wait_unused); static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct mb_cache_entry *entry, u32 key) { struct mb_cache_entry *old_entry = entry; struct hlist_bl_node *node; struct hlist_bl_head *head; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) node = entry->e_hash_list.next; else node = hlist_bl_first(head); while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); if (entry->e_key == key && test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; } entry = NULL; out: hlist_bl_unlock(head); if (old_entry) mb_cache_entry_put(cache, old_entry); return entry; } /* * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * * Search in @cache for a reusable entry with key @key. Grabs reference to the * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) { return __entry_find(cache, NULL, key); } EXPORT_SYMBOL(mb_cache_entry_find_first); /* * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * * Finds next reusable entry in the hash chain which has the same key as @entry. * If @entry is unhashed (which can happen when deletion of entry races with the * search), finds the first reusable entry in the hash chain. The function drops * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) { return __entry_find(cache, entry, entry->e_key); } EXPORT_SYMBOL(mb_cache_entry_find_next); /* * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with * @key - key * @value - value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; struct mb_cache_entry *entry; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { if (entry->e_key == key && entry->e_value == value && atomic_inc_not_zero(&entry->e_refcnt)) goto out; } entry = NULL; out: hlist_bl_unlock(head); return entry; } EXPORT_SYMBOL(mb_cache_entry_get); /* mb_cache_entry_delete_or_get - remove a cache entry if it has no users * @cache - cache we work with * @key - key * @value - value * * Remove entry from cache @cache with key @key and value @value. The removal * happens only if the entry is unused. The function returns NULL in case the * entry was successfully removed or there's no entry in cache. Otherwise the * function grabs reference of the entry that we failed to delete because it * still has users and return it. 
*/ struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, u32 key, u64 value) { struct mb_cache_entry *entry; entry = mb_cache_entry_get(cache, key, value); if (!entry) return NULL; /* * Drop the ref we got from mb_cache_entry_get() and the initial hash * ref if we are the last user */ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2) return entry; spin_lock(&cache->c_list_lock); if (!list_empty(&entry->e_list)) list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); return NULL; } EXPORT_SYMBOL(mb_cache_entry_delete_or_get); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to * @entry - entry that got used * * Marks entry as used to give hit higher chances of surviving in cache. */ void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan) { struct mb_cache_entry *entry; unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); shrunk++; cond_resched(); spin_lock(&cache->c_list_lock); } spin_unlock(&cache->c_list_lock); return shrunk; } static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ #define SHRINK_DIVISOR 16 static void mb_cache_shrink_worker(struct work_struct *work) { struct mb_cache *cache = container_of(work, struct mb_cache, c_shrink_work); mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); } /* * mb_cache_create - create cache * @bucket_bits: log2 of the hash table size * * Create cache for keys with 2^bucket_bits hash entries. 
*/ struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; unsigned long bucket_count = 1UL << bucket_bits; unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) goto err_out; cache->c_bucket_bits = bucket_bits; cache->c_max_entries = bucket_count << 4; INIT_LIST_HEAD(&cache->c_list); spin_lock_init(&cache->c_list_lock); cache->c_hash = kmalloc_array(bucket_count, sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_hash) { kfree(cache); goto err_out; } for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } cache->c_shrink->count_objects = mb_cache_count; cache->c_shrink->scan_objects = mb_cache_scan; cache->c_shrink->private_data = cache; shrinker_register(cache->c_shrink); INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; err_out: return NULL; } EXPORT_SYMBOL(mb_cache_create); /* * mb_cache_destroy - destroy cache * @cache: the cache to destroy * * Free all entries in cache and cache itself. Caller must make sure nobody * (except shrinker) can reach @cache when calling this. */ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this * point. */ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb_cache_entry_put(cache, entry); } kfree(cache->c_hash); kfree(cache); } EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT); if (!mb_entry_cache) return -ENOMEM; return 0; } static void __exit mbcache_exit(void) { kmem_cache_destroy(mb_entry_cache); } module_init(mbcache_init) module_exit(mbcache_exit) MODULE_AUTHOR("Jan Kara <jack@suse.cz>"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL"); |
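/*
 * Hedged usage sketch, not part of mbcache itself: the way a filesystem such
 * as ext4 deduplicates xattr blocks with this cache.  The hash value, block
 * number, and bucket count are made-up examples; mb_cache_entry_put(), the
 * find helpers, and the entry fields come from <linux/mbcache.h> and the code
 * above.
 */
#include <linux/gfp.h>
#include <linux/mbcache.h>

static void example_mbcache_usage(void)
{
        struct mb_cache *cache;
        struct mb_cache_entry *entry;
        u32 hash = 0x1234abcd;  /* hash of a candidate xattr block's contents */
        u64 blocknr = 42;       /* physical block that holds those contents */

        cache = mb_cache_create(10);    /* 2^10 hash buckets */
        if (!cache)
                return;

        /* Remember that @blocknr holds data hashing to @hash. */
        if (mb_cache_entry_create(cache, GFP_NOFS, hash, blocknr, true))
                goto out;       /* -ENOMEM, or -EBUSY if the pair is cached */

        /* Later: walk all reusable entries with a matching content hash. */
        entry = mb_cache_entry_find_first(cache, hash);
        while (entry) {
                /* Compare on-disk data against entry->e_value here ... */
                entry = mb_cache_entry_find_next(cache, entry);
        }
out:
        mb_cache_destroy(cache);
}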
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SHA-256 optimized for x86_64
 *
 * Copyright 2025 Google LLC
 */
#include <asm/fpu/api.h>
#include <linux/static_call.h>

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);

DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);

#define DEFINE_X86_SHA256_FN(c_fn, asm_fn)                              \
        asmlinkage void asm_fn(struct sha256_block_state *state,        \
                               const u8 *data, size_t nblocks);         \
        static void c_fn(struct sha256_block_state *state, const u8 *data, \
                         size_t nblocks)                                \
        {                                                               \
                if (likely(irq_fpu_usable())) {                         \
                        kernel_fpu_begin();                             \
                        asm_fn(state, data, nblocks);                   \
                        kernel_fpu_end();                               \
                } else {                                                \
                        sha256_blocks_generic(state, data, nblocks);    \
                }                                                       \
        }

DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);

static void sha256_blocks(struct sha256_block_state *state,
                          const u8 *data, size_t nblocks)
{
        static_call(sha256_blocks_x86)(state, data, nblocks);
}

static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
                                  const u8 *data1, const u8 *data2, int len,
                                  u8 out1[SHA256_DIGEST_SIZE],
                                  u8 out2[SHA256_DIGEST_SIZE]);

#define sha256_finup_2x_arch sha256_finup_2x_arch
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
                                 const u8 *data1, const u8 *data2, size_t len,
                                 u8 out1[SHA256_DIGEST_SIZE],
                                 u8 out2[SHA256_DIGEST_SIZE])
{
        /*
         * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
         * Further limit len to 65536 to avoid spending too long with preemption
         * disabled. (Of course, in practice len is nearly always 4096 anyway.)
         */
        if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
            len <= 65536 && likely(irq_fpu_usable())) {
                kernel_fpu_begin();
                sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
                kernel_fpu_end();
                kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
                kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
                return true;
        }
        return false;
}

static bool sha256_finup_2x_is_optimized_arch(void)
{
        return static_key_enabled(&have_sha_ni);
}

#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
{
        if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
                static_call_update(sha256_blocks_x86, sha256_blocks_ni);
                static_branch_enable(&have_sha_ni);
        } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
                                     NULL) &&
                   boot_cpu_has(X86_FEATURE_AVX)) {
                if (boot_cpu_has(X86_FEATURE_AVX2) &&
                    boot_cpu_has(X86_FEATURE_BMI2))
                        static_call_update(sha256_blocks_x86,
                                           sha256_blocks_avx2);
                else
                        static_call_update(sha256_blocks_x86,
                                           sha256_blocks_avx);
        } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
                static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
        }
}
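/*
 * Illustrative sketch of the dispatch pattern above, with hypothetical names
 * (frob_*): a static_call is defined pointing at a generic implementation and
 * retargeted once at init time to the best variant the CPU supports, so later
 * calls go through a patched direct call instead of an indirect function
 * pointer.
 */
#include <linux/static_call.h>
#include <linux/types.h>

static void frob_generic(int x) { /* portable fallback */ }
static void frob_fast(int x) { /* CPU-feature-specific variant */ }

DEFINE_STATIC_CALL(frob_impl, frob_generic);

static void frob_pick_impl(bool cpu_has_fast_path)
{
        if (cpu_has_fast_path)
                static_call_update(frob_impl, frob_fast);
}

static void frob(int x)
{
        static_call(frob_impl)(x);      /* dispatches to the chosen variant */
}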
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the /proc/irq/ handling code. */ #include <linux/irq.h> #include <linux/gfp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/mutex.h> #include "internals.h" /* * Access rules: * * procfs protects read/write of /proc/irq/N/ files against a * concurrent free of the interrupt descriptor. remove_proc_entry() * immediately prevents new read/writes to happen and waits for * already running read/write functions to complete. * * We remove the proc entries first and then delete the interrupt * descriptor from the radix tree and free it. So it is guaranteed * that irq_to_desc(N) is valid as long as the read/writes are * permitted by procfs. * * The read from /proc/interrupts is a different problem because there * is no protection. So the lookup and the access to irqdesc * information must be protected by sparse_irq_lock.
*/ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP enum { AFFINITY, AFFINITY_LIST, EFFECTIVE, EFFECTIVE_LIST, }; static int show_irq_affinity(int type, struct seq_file *m) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask; guard(raw_spinlock_irq)(&desc->lock); switch (type) { case AFFINITY: case AFFINITY_LIST: mask = desc->irq_common_data.affinity; if (irq_move_pending(&desc->irq_data)) mask = irq_desc_get_pending_mask(desc); break; case EFFECTIVE: case EFFECTIVE_LIST: #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK mask = irq_data_get_effective_affinity_mask(&desc->irq_data); break; #endif default: return -EINVAL; } switch (type) { case AFFINITY_LIST: case EFFECTIVE_LIST: seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); break; case AFFINITY: case EFFECTIVE: seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); break; } return 0; } static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); cpumask_var_t mask; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; scoped_guard(raw_spinlock_irq, &desc->lock) { if (desc->affinity_hint) cpumask_copy(mask, desc->affinity_hint); } seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return 0; } int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(AFFINITY, m); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(AFFINITY_LIST, m); } #ifndef CONFIG_AUTO_IRQ_AFFINITY static inline int irq_select_affinity_usr(unsigned int irq) { /* * If the interrupt is started up already then this fails. The * interrupt is assigned to an online CPU already. There is no * point to move it around randomly. Tell user space that the * selected mask is bogus. * * If not then any change to the affinity is pointless because the * startup code invokes irq_setup_affinity() which will select * a online CPU anyway. */ return -EINVAL; } #else /* ALPHA magic affinity auto selector. Keep it for historical reasons. */ static inline int irq_select_affinity_usr(unsigned int irq) { return irq_select_affinity(irq); } #endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)pde_data(file_inode(file)); cpumask_var_t new_value; int err; if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EPERM; if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; if (type) err = cpumask_parselist_user(buffer, count, new_value); else err = cpumask_parse_user(buffer, count, new_value); if (err) goto free_cpumask; /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ if (!cpumask_intersects(new_value, cpu_online_mask)) { /* * Special case for empty set - allow the architecture code * to set default SMP affinity. */ err = irq_select_affinity_usr(irq) ? 
-EINVAL : count; } else { err = irq_set_affinity(irq, new_value); if (!err) err = count; } free_cpumask: free_cpumask_var(new_value); return err; } static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { return write_irq_affinity(0, file, buffer, count, pos); } static ssize_t irq_affinity_list_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { return write_irq_affinity(1, file, buffer, count, pos); } static int irq_affinity_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_proc_show, pde_data(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_list_proc_show, pde_data(inode)); } static const struct proc_ops irq_affinity_proc_ops = { .proc_open = irq_affinity_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = irq_affinity_proc_write, }; static const struct proc_ops irq_affinity_list_proc_ops = { .proc_open = irq_affinity_list_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = irq_affinity_list_proc_write, }; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static int irq_effective_aff_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE, m); } static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE_LIST, m); } #endif static int default_affinity_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); return 0; } static ssize_t default_affinity_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { cpumask_var_t new_value; int err; if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (err) goto out; /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. 
*/ if (!cpumask_intersects(new_value, cpu_online_mask)) { err = -EINVAL; goto out; } cpumask_copy(irq_default_affinity, new_value); err = count; out: free_cpumask_var(new_value); return err; } static int default_affinity_open(struct inode *inode, struct file *file) { return single_open(file, default_affinity_show, pde_data(inode)); } static const struct proc_ops default_affinity_proc_ops = { .proc_open = default_affinity_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = default_affinity_write, }; static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", desc->irq_count, desc->irqs_unhandled, jiffies_to_msecs(desc->last_unhandled)); return 0; } #define MAX_NAMELEN 128 static bool name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; guard(raw_spinlock_irq)(&desc->lock); for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) return false; } return true; } void register_handler_proc(unsigned int irq, struct irqaction *action) { char name[MAX_NAMELEN]; struct irq_desc *desc = irq_to_desc(irq); if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN #define MAX_NAMELEN 10 void register_irq_proc(unsigned int irq, struct irq_desc *desc) { static DEFINE_MUTEX(register_lock); void __maybe_unused *irqp = (void *)(unsigned long) irq; char name [MAX_NAMELEN]; if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; /* * irq directories are registered only when a handler is * added, not when the descriptor is created, so multiple * tasks might try to register at the same time. 
*/ guard(mutex)(&register_lock); if (desc->dir) return; /* create /proc/irq/1234 */ sprintf(name, "%u", irq); desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) return; #ifdef CONFIG_SMP umode_t umode = S_IRUGO; if (irq_can_set_affinity_usr(desc->irq_data.irq)) umode |= S_IWUSR; /* create /proc/irq/<irq>/smp_affinity */ proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp); /* create /proc/irq/<irq>/affinity_hint */ proc_create_single_data("affinity_hint", 0444, desc->dir, irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", umode, desc->dir, &irq_affinity_list_proc_ops, irqp); proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK proc_create_single_data("effective_affinity", 0444, desc->dir, irq_effective_aff_proc_show, irqp); proc_create_single_data("effective_affinity_list", 0444, desc->dir, irq_effective_aff_list_proc_show, irqp); # endif #endif proc_create_single_data("spurious", 0444, desc->dir, irq_spurious_proc_show, (void *)(long)irq); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; if (!root_irq_dir || !desc->dir) return; #ifdef CONFIG_SMP remove_proc_entry("smp_affinity", desc->dir); remove_proc_entry("affinity_hint", desc->dir); remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK remove_proc_entry("effective_affinity", desc->dir); remove_proc_entry("effective_affinity_list", desc->dir); # endif #endif remove_proc_entry("spurious", desc->dir); sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } #undef MAX_NAMELEN void unregister_handler_proc(unsigned int irq, struct irqaction *action) { proc_remove(action->dir); } static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP proc_create("irq/default_smp_affinity", 0644, NULL, &default_affinity_proc_ops); #endif } void init_irq_proc(void) { unsigned int irq; struct irq_desc *desc; /* create /proc/irq */ root_irq_dir = proc_mkdir("irq", NULL); if (!root_irq_dir) return; register_default_affinity_proc(); /* * Create entries for all existing IRQs. */ for_each_irq_desc(irq, desc) register_irq_proc(irq, desc); } #ifdef CONFIG_GENERIC_IRQ_SHOW int __weak arch_show_interrupts(struct seq_file *p, int prec) { return 0; } #ifndef ACTUAL_NR_IRQS # define ACTUAL_NR_IRQS irq_get_nr_irqs() #endif int show_interrupts(struct seq_file *p, void *v) { const unsigned int nr_irqs = irq_get_nr_irqs(); static int prec; int i = *(loff_t *) v, j; struct irqaction *action; struct irq_desc *desc; if (i > ACTUAL_NR_IRQS) return 0; if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ if (i == 0) { for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) j *= 10; seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } guard(rcu)(); desc = irq_to_desc(i); if (!desc || irq_settings_is_hidden(desc)) return 0; if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) return 0; seq_printf(p, "%*d:", prec, i); for_each_online_cpu(j) { unsigned int cnt = desc->kstat_irqs ?
per_cpu(desc->kstat_irqs->cnt, j) : 0; seq_put_decimal_ull_width(p, " ", cnt, 10); } seq_putc(p, ' '); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); else if (desc->irq_data.chip->name) seq_printf(p, "%8s", desc->irq_data.chip->name); else seq_printf(p, "%8s", "-"); } else { seq_printf(p, "%8s", "None"); } if (desc->irq_data.domain) seq_printf(p, " %*lu", prec, desc->irq_data.hwirq); else seq_printf(p, " %*s", prec, ""); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif if (desc->name) seq_printf(p, "-%-8s", desc->name); action = desc->action; if (action) { seq_printf(p, " %s", action->name); while ((action = action->next) != NULL) seq_printf(p, ", %s", action->name); } seq_putc(p, '\n'); return 0; } #endif |
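Seen from userspace, the files created by register_irq_proc() behave as the handlers above implement: smp_affinity parses a hexadecimal CPU mask, smp_affinity_list parses a CPU list such as "0-3", and a mask containing no online CPU is normally rejected. A small sketch follows; the IRQ number 42 is arbitrary, and writing requires root on an interrupt whose affinity is user-settable.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/irq/42/smp_affinity_list";
	char buf[64];
	ssize_t n;
	int fd;

	/* Read the current affinity as a CPU list, e.g. "0-3". */
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("current affinity: %s", buf);
	}
	close(fd);

	/* Restrict the IRQ to CPUs 0-1; fails if neither CPU is online. */
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open for write");
		return 1;
	}
	if (write(fd, "0-1\n", 4) < 0)
		perror("write");
	close(fd);
	return 0;
}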
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: * The group->refcnt and mark->refcnt tell how many "things" in the kernel * currently are referencing the objects. Both kinds of objects typically will * live inside the kernel with a refcnt of 2, one for its creation and one for * the reference a group and a mark hold to each other. * If you are holding the appropriate locks, you can take a reference and the * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: * There are 3 locks involved with fsnotify inode marks and they MUST be taken * in order as follows: * * group->mark_mutex * mark->lock * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private * data (i.e group limits). * mark->lock protects the marks attributes like its masks and flags. * Furthermore it protects the access to a reference of the group that the mark * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * * mark->connector->lock protects the list of marks anchored inside an * inode / vfsmount and each mark is hooked via the i_list. * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets * detached from fsnotify_mark_connector when last reference to the mark is * dropped. Thus having mark reference is enough to protect mark->connector * pointer and to make sure fsnotify_mark_connector cannot disappear. Also * because we remove mark from g_list before dropping mark reference associated * with that, any mark found through g_list is guaranteed to have * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) * - The inode is being evicted from cache. (fsnotify_inode_delete) * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions.
*/ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static fsnotify_connp_t *fsnotify_object_connp(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return &((struct inode *)obj)->i_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); case FSNOTIFY_OBJ_TYPE_MNTNS: return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return READ_ONCE(*fsnotify_conn_mask_p(conn)); } static void fsnotify_get_sb_watched_objects(struct super_block *sb) { atomic_long_inc(fsnotify_sb_watched_objects(sb)); } static void fsnotify_put_sb_watched_objects(struct super_block *sb) { atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb); /* the superblock can go away after this decrement */ if (atomic_long_dec_and_test(watched_objects)) wake_up_var(watched_objects); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); fsnotify_get_sb_watched_objects(inode->i_sb); } static void fsnotify_put_inode_ref(struct inode *inode) { /* read ->i_sb before the inode can go away */ struct super_block *sb = inode->i_sb; iput(inode); fsnotify_put_sb_watched_objects(sb); } /* * Grab or drop watched objects reference depending on whether the connector * is attached and has any marks attached. 
*/ static void fsnotify_update_sb_watchers(struct super_block *sb, struct fsnotify_mark_connector *conn) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; struct fsnotify_mark *first_mark = NULL; unsigned int highest_prio = 0; if (conn->obj) first_mark = hlist_entry_safe(conn->list.first, struct fsnotify_mark, obj_list); if (first_mark) highest_prio = first_mark->group->priority; if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) highest_prio = 0; /* * If the highest priority of group watching this object is prio, * then watched object has a reference on counters [0..prio]. * Update priority >= 1 watched objects counters. */ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) atomic_long_inc(&sbinfo->watched_objects[p]); for (unsigned int p = conn->prio; p > highest_prio; p--) atomic_long_dec(&sbinfo->watched_objects[p]); conn->prio = highest_prio; /* Update priority >= 0 (a.k.a total) watched objects counter */ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); if (first_mark && !is_watched) { conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); } else if (!first_mark && is_watched) { conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); } } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } /* * We use WRITE_ONCE() to prevent silly compiler optimizations from * confusing readers not holding conn->lock with partial updates. */ WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask); return fsnotify_update_iref(conn, want_iref); } static bool fsnotify_conn_watches_children( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return false; return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); } static void fsnotify_conn_set_children_dentry_flags( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); } /* * Calculate mask of events for a list of marks. 
The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { bool update_children; if (!conn) return; spin_lock(&conn->lock); update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); /* * Set children's PARENT_WATCHED flags only if parent started watching. * When parent stops watching, we clear false positive PARENT_WATCHED * flags lazily in __fsnotify_parent(). */ if (update_children) fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kmem_cache_free(fsnotify_mark_connector_cachep, free); } } static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; if (sb) fsnotify_update_sb_watchers(sb, conn); return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. 
*/ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { struct super_block *sb = fsnotify_connector_sb(conn); /* Update watched objects after detaching mark */ if (sb) fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. 
* * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. 
*/ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_info_to_sb(struct super_block *sb) { struct fsnotify_sb_info *sbinfo; /* sb info is freed on fsnotify_sb_delete() */ sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); if (!sbinfo) return -ENOMEM; /* * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() * will observe an initialized structure */ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { /* Someone else created sbinfo for us */ kfree(sbinfo); } return 0; } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->prio = 0; conn->type = obj_type; conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; } connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? 
*/ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: if (sb) fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! * group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; if (!connp) return NULL; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. 
And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); |
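inotify is one of the fsnotify backends built on this mark machinery: adding a watch attaches an inode mark through the fsnotify_add_mark() path, removing it goes through fsnotify_destroy_mark(), and closing the instance tears down the group together with any marks it still holds. A userspace sketch of that lifecycle follows; the watched path /tmp is arbitrary.

#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	int fd, wd;

	/* Creates an fsnotify group for this inotify instance. */
	fd = inotify_init1(IN_CLOEXEC);
	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}

	/* Attaches an inode mark on /tmp for create/delete events. */
	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0) {
		perror("inotify_add_watch");
		close(fd);
		return 1;
	}

	/* Detaches and frees the mark again. */
	inotify_rm_watch(fd, wd);

	/* Destroys the group and any marks still attached to it. */
	close(fd);
	return 0;
}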
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
*/ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> #include <linux/parser.h> #include <linux/ctype.h> #include <linux/namei.h> #include <linux/miscdevice.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> #include <linux/security.h> #include <linux/fs_parser.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "direct-io.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" #include "space-info.h" #include "sysfs.h" #include "zoned.h" #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" #include "qgroup.h" #include "raid56.h" #include "fs.h" #include "accessors.h" #include "defrag.h" #include "dir-item.h" #include "ioctl.h" #include "scrub.h" #include "verity.h" #include "super.h" #include "extent-tree.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; static void btrfs_put_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid); close_ctree(fs_info); } /* Store the mount options related information. */ struct btrfs_fs_context { char *subvol_name; u64 subvol_objectid; u64 max_inline; u32 commit_interval; u32 metadata_ratio; u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; int compress_level; refcount_t refs; }; static void btrfs_emit_options(struct btrfs_fs_info *info, struct btrfs_fs_context *old); enum { Opt_acl, Opt_clear_cache, Opt_commit_interval, Opt_compress, Opt_compress_force, Opt_compress_force_type, Opt_compress_type, Opt_degraded, Opt_device, Opt_fatal_errors, Opt_flushoncommit, Opt_max_inline, Opt_barrier, Opt_datacow, Opt_datasum, Opt_defrag, Opt_discard, Opt_discard_mode, Opt_ratio, Opt_rescan_uuid_tree, Opt_skip_balance, Opt_space_cache, Opt_space_cache_version, Opt_ssd, Opt_ssd_spread, Opt_subvol, Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, Opt_treelog, Opt_user_subvol_rm_allowed, Opt_norecovery, /* Rescue options */ Opt_rescue, Opt_usebackuproot, /* Debugging options */ Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, Opt_ref_verify, Opt_ref_tracker, #endif Opt_err, }; enum { Opt_fatal_errors_panic, Opt_fatal_errors_bug, }; static const struct constant_table btrfs_parameter_fatal_errors[] = { { "panic", Opt_fatal_errors_panic }, { "bug", Opt_fatal_errors_bug }, {} }; enum { Opt_discard_sync, Opt_discard_async, }; static const struct constant_table btrfs_parameter_discard[] = { { "sync", Opt_discard_sync }, { "async", Opt_discard_async }, {} }; enum { Opt_space_cache_v1, Opt_space_cache_v2, }; static const struct constant_table btrfs_parameter_space_cache[] = { { "v1", Opt_space_cache_v1 }, { "v2", Opt_space_cache_v2 }, {} }; enum { Opt_rescue_usebackuproot, Opt_rescue_nologreplay, Opt_rescue_ignorebadroots, 
Opt_rescue_ignoredatacsums, Opt_rescue_ignoremetacsums, Opt_rescue_ignoresuperflags, Opt_rescue_parameter_all, }; static const struct constant_table btrfs_parameter_rescue[] = { { "usebackuproot", Opt_rescue_usebackuproot }, { "nologreplay", Opt_rescue_nologreplay }, { "ignorebadroots", Opt_rescue_ignorebadroots }, { "ibadroots", Opt_rescue_ignorebadroots }, { "ignoredatacsums", Opt_rescue_ignoredatacsums }, { "ignoremetacsums", Opt_rescue_ignoremetacsums}, { "ignoresuperflags", Opt_rescue_ignoresuperflags}, { "idatacsums", Opt_rescue_ignoredatacsums }, { "imetacsums", Opt_rescue_ignoremetacsums}, { "isuperflags", Opt_rescue_ignoresuperflags}, { "all", Opt_rescue_parameter_all }, {} }; #ifdef CONFIG_BTRFS_DEBUG enum { Opt_fragment_parameter_data, Opt_fragment_parameter_metadata, Opt_fragment_parameter_all, }; static const struct constant_table btrfs_parameter_fragment[] = { { "data", Opt_fragment_parameter_data }, { "metadata", Opt_fragment_parameter_metadata }, { "all", Opt_fragment_parameter_all }, {} }; #endif static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("acl", Opt_acl), fsparam_flag_no("autodefrag", Opt_defrag), fsparam_flag_no("barrier", Opt_barrier), fsparam_flag("clear_cache", Opt_clear_cache), fsparam_u32("commit", Opt_commit_interval), fsparam_flag("compress", Opt_compress), fsparam_string("compress", Opt_compress_type), fsparam_flag("compress-force", Opt_compress_force), fsparam_string("compress-force", Opt_compress_force_type), fsparam_flag_no("datacow", Opt_datacow), fsparam_flag_no("datasum", Opt_datasum), fsparam_flag("degraded", Opt_degraded), fsparam_string("device", Opt_device), fsparam_flag_no("discard", Opt_discard), fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), fsparam_flag_no("flushoncommit", Opt_flushoncommit), fsparam_string("max_inline", Opt_max_inline), fsparam_u32("metadata_ratio", Opt_ratio), fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), fsparam_flag("skip_balance", Opt_skip_balance), fsparam_flag_no("space_cache", Opt_space_cache), fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache), fsparam_flag_no("ssd", Opt_ssd), fsparam_flag_no("ssd_spread", Opt_ssd_spread), fsparam_string("subvol", Opt_subvol), fsparam_flag("subvol=", Opt_subvol_empty), fsparam_u64("subvolid", Opt_subvolid), fsparam_u32("thread_pool", Opt_thread_pool), fsparam_flag_no("treelog", Opt_treelog), fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed), /* Rescue options. */ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), /* For compatibility only, alias for "rescue=nologreplay". */ fsparam_flag("norecovery", Opt_norecovery), /* Debugging options. 
*/ fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), fsparam_flag("ref_tracker", Opt_ref_tracker), fsparam_flag("ref_verify", Opt_ref_verify), #endif {} }; static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level) { const int len = strlen(type); return (strncmp(string, type, len) == 0) && ((may_have_level && string[len] == ':') || string[len] == '\0'); } static int btrfs_parse_compress(struct btrfs_fs_context *ctx, const struct fs_parameter *param, int opt) { const char *string = param->string; int ret; /* * Provide the same semantics as older kernels that don't use fs * context, specifying the "compress" option clears "force-compress" * without the need to pass "compress-force=[no|none]" before * specifying "compress". */ if (opt != Opt_compress_force && opt != Opt_compress_force_type) btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); if (opt == Opt_compress || opt == Opt_compress_force) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zlib", true)) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, &ctx->compress_level); if (ret < 0) goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, &ctx->compress_level); if (ret < 0) goto error; if (string[3] == ':' && string[4]) btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zstd", true)) { ctx->compress_type = BTRFS_COMPRESS_ZSTD; ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, &ctx->compress_level); if (ret < 0) goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "no", false) || btrfs_match_compress_type(string, "none", false)) { ctx->compress_level = 0; ctx->compress_type = 0; btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); } else { ret = -EINVAL; goto error; } return 0; error: btrfs_err(NULL, "failed to parse compression option '%s'", string); return ret; } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct btrfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, btrfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_degraded: btrfs_set_opt(ctx->mount_opt, DEGRADED); break; case Opt_subvol_empty: /* * This exists because we used to allow it on accident, so we're * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow * empty subvol= again"). */ break; case Opt_subvol: kfree(ctx->subvol_name); ctx->subvol_name = kstrdup(param->string, GFP_KERNEL); if (!ctx->subvol_name) return -ENOMEM; break; case Opt_subvolid: ctx->subvol_objectid = result.uint_64; /* subvolid=0 means give me the original fs_tree. 
*/ if (!ctx->subvol_objectid) ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; break; case Opt_device: { struct btrfs_device *device; mutex_lock(&uuid_mutex); device = btrfs_scan_one_device(param->string, false); mutex_unlock(&uuid_mutex); if (IS_ERR(device)) return PTR_ERR(device); break; } case Opt_datasum: if (result.negated) { btrfs_set_opt(ctx->mount_opt, NODATASUM); } else { btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } break; case Opt_datacow: if (result.negated) { btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); btrfs_set_opt(ctx->mount_opt, NODATACOW); btrfs_set_opt(ctx->mount_opt, NODATASUM); } else { btrfs_clear_opt(ctx->mount_opt, NODATACOW); } break; case Opt_compress_force: case Opt_compress_force_type: btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS); fallthrough; case Opt_compress: case Opt_compress_type: if (btrfs_parse_compress(ctx, param, opt)) return -EINVAL; break; case Opt_ssd: if (result.negated) { btrfs_set_opt(ctx->mount_opt, NOSSD); btrfs_clear_opt(ctx->mount_opt, SSD); btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); } else { btrfs_set_opt(ctx->mount_opt, SSD); btrfs_clear_opt(ctx->mount_opt, NOSSD); } break; case Opt_ssd_spread: if (result.negated) { btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); } else { btrfs_set_opt(ctx->mount_opt, SSD); btrfs_set_opt(ctx->mount_opt, SSD_SPREAD); btrfs_clear_opt(ctx->mount_opt, NOSSD); } break; case Opt_barrier: if (result.negated) btrfs_set_opt(ctx->mount_opt, NOBARRIER); else btrfs_clear_opt(ctx->mount_opt, NOBARRIER); break; case Opt_thread_pool: if (result.uint_32 == 0) { btrfs_err(NULL, "invalid value 0 for thread_pool"); return -EINVAL; } ctx->thread_pool_size = result.uint_32; break; case Opt_max_inline: ctx->max_inline = memparse(param->string, NULL); break; case Opt_acl: if (result.negated) { fc->sb_flags &= ~SB_POSIXACL; } else { #ifdef CONFIG_BTRFS_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #else btrfs_err(NULL, "support for ACL not compiled in"); return -EINVAL; #endif } /* * VFS limits the ability to toggle ACL on and off via remount, * despite every file system allowing this. This seems to be * an oversight since we all do, but it'll fail if we're * remounting. So don't set the mask here, we'll check it in * btrfs_reconfigure and do the toggling ourselves. 
*/ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) fc->sb_flags_mask |= SB_POSIXACL; break; case Opt_treelog: if (result.negated) btrfs_set_opt(ctx->mount_opt, NOTREELOG); else btrfs_clear_opt(ctx->mount_opt, NOTREELOG); break; case Opt_norecovery: btrfs_info(NULL, "'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; case Opt_flushoncommit: if (result.negated) btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); else btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT); break; case Opt_ratio: ctx->metadata_ratio = result.uint_32; break; case Opt_discard: if (result.negated) { btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); btrfs_set_opt(ctx->mount_opt, NODISCARD); } else { btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); } break; case Opt_discard_mode: switch (result.uint_32) { case Opt_discard_sync: btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); break; case Opt_discard_async: btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC); break; default: btrfs_err(NULL, "unrecognized discard mode value %s", param->key); return -EINVAL; } btrfs_clear_opt(ctx->mount_opt, NODISCARD); break; case Opt_space_cache: if (result.negated) { btrfs_set_opt(ctx->mount_opt, NOSPACECACHE); btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); } else { btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); } break; case Opt_space_cache_version: switch (result.uint_32) { case Opt_space_cache_v1: btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); break; case Opt_space_cache_v2: btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE); break; default: btrfs_err(NULL, "unrecognized space_cache value %s", param->key); return -EINVAL; } break; case Opt_rescan_uuid_tree: btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); break; case Opt_clear_cache: btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); break; case Opt_user_subvol_rm_allowed: btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED); break; case Opt_enospc_debug: if (result.negated) btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG); else btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: if (result.negated) btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG); else btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG); break; case Opt_usebackuproot: btrfs_warn(NULL, "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); /* If we're loading the backup roots we can't trust the space cache. 
*/ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); break; case Opt_skip_balance: btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); break; case Opt_fatal_errors: switch (result.uint_32) { case Opt_fatal_errors_panic: btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; case Opt_fatal_errors_bug: btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; default: btrfs_err(NULL, "unrecognized fatal_errors value %s", param->key); return -EINVAL; } break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { btrfs_warn(NULL, "excessive commit interval %u, use with care", ctx->commit_interval); } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; case Opt_rescue: switch (result.uint_32) { case Opt_rescue_usebackuproot: btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); break; case Opt_rescue_nologreplay: btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; case Opt_rescue_ignorebadroots: btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); break; case Opt_rescue_ignoredatacsums: btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); break; case Opt_rescue_ignoremetacsums: btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); break; case Opt_rescue_ignoresuperflags: btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); break; case Opt_rescue_parameter_all: btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; default: btrfs_info(NULL, "unrecognized rescue option '%s'", param->key); return -EINVAL; } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment: switch (result.uint_32) { case Opt_fragment_parameter_all: btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; case Opt_fragment_parameter_metadata: btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; case Opt_fragment_parameter_data: btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); break; default: btrfs_info(NULL, "unrecognized fragment option '%s'", param->key); return -EINVAL; } break; case Opt_ref_verify: btrfs_set_opt(ctx->mount_opt, REF_VERIFY); break; case Opt_ref_tracker: btrfs_set_opt(ctx->mount_opt, REF_TRACKER); break; #endif default: btrfs_err(NULL, "unrecognized mount option '%s'", param->key); return -EINVAL; } return 0; } /* * Some options only have meaning at mount time and shouldn't persist across * remounts, or be displayed. Clear these at the end of mount and remount code * paths. 
*/ static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) { btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); } static bool check_ro_option(const struct btrfs_fs_info *fs_info, unsigned long long mount_opt, unsigned long long opt, const char *opt_name) { if (mount_opt & opt) { btrfs_err(fs_info, "%s must be used with ro mount option", opt_name); return true; } return false; } bool btrfs_check_options(const struct btrfs_fs_info *info, unsigned long long *mount_opt, unsigned long flags) { bool ret = true; if (!(flags & SB_RDONLY) && (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") || check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") || check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags"))) ret = false; if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) && !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) { btrfs_err(info, "cannot disable free-space-tree"); ret = false; } if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) { btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); ret = false; } if (btrfs_check_mountopts_zoned(info, mount_opt)) ret = false; if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_warn(info, "space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); } } return ret; } /* * This is subtle, we only call this during open_ctree(). We need to pre-load * the mount options with the on-disk settings. Before the new mount API took * effect we would do this on mount and remount. With the new mount API we'll * only do this on the initial mount. * * This isn't a change in behavior, because we're using the current state of the * file system to set the current mount options. If you mounted with special * options to disable these features and then remounted we wouldn't revert the * settings, because mounting without these features cleared the on-disk * settings, so this being called on re-mount is not needed. */ void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) { if (fs_info->sectorsize < PAGE_SIZE) { btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "forcing free space tree for sector size %u with page size %lu", fs_info->sectorsize, PAGE_SIZE); btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); } } /* * At this point our mount options are populated, so we only mess with * these settings if we don't have any settings already. */ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) return; if (btrfs_is_zoned(fs_info) && btrfs_free_space_cache_v1_active(fs_info)) { btrfs_info(fs_info, "zoned: clearing existing space cache"); btrfs_set_super_cache_generation(fs_info->super_copy, 0); return; } if (btrfs_test_opt(fs_info, SPACE_CACHE)) return; if (btrfs_test_opt(fs_info, NOSPACECACHE)) return; /* * At this point we don't have explicit options set by the user, set * them ourselves based on the state of the file system. 
*/ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); else if (btrfs_free_space_cache_v1_active(fs_info)) btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); } static void set_device_specific_options(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) btrfs_set_opt(fs_info->mount_opt, SSD); /* * For devices supporting discard turn on discard=async automatically, * unless it's already set or disabled. This could be turned off by * nodiscard for the same mount. * * The zoned mode piggy backs on the discard functionality for * resetting a zone. There is no reason to delay the zone reset as it is * fast enough. So, do not enable async discard for zoned mode. */ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || btrfs_test_opt(fs_info, DISCARD_ASYNC) || btrfs_test_opt(fs_info, NODISCARD)) && fs_info->fs_devices->discardable && !btrfs_is_zoned(fs_info)) btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); } char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *fs_root = NULL; struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); char *name = NULL, *ptr; u64 dirid; int len; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto err; } ptr = name + PATH_MAX - 1; ptr[0] = '\0'; /* * Walk up the subvolume trees in the tree of tree roots by root * backrefs until we hit the top-level subvolume. */ while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { key.objectid = subvol_objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) { goto err; } else if (ret > 0) { ret = -ENOENT; goto err; } subvol_objectid = key.offset; root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_root_ref); len = btrfs_root_ref_name_len(path->nodes[0], root_ref); ptr -= len + 1; if (ptr < name) { ret = -ENAMETOOLONG; goto err; } read_extent_buffer(path->nodes[0], ptr + 1, (unsigned long)(root_ref + 1), len); ptr[0] = '/'; dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); btrfs_release_path(path); fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); fs_root = NULL; goto err; } /* * Walk up the filesystem tree by inode refs until we hit the * root directory. 
*/ while (dirid != BTRFS_FIRST_FREE_OBJECTID) { key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; ret = btrfs_search_backwards(fs_root, &key, path); if (ret < 0) { goto err; } else if (ret > 0) { ret = -ENOENT; goto err; } dirid = key.offset; inode_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(path->nodes[0], inode_ref); ptr -= len + 1; if (ptr < name) { ret = -ENAMETOOLONG; goto err; } read_extent_buffer(path->nodes[0], ptr + 1, (unsigned long)(inode_ref + 1), len); ptr[0] = '/'; btrfs_release_path(path); } btrfs_put_root(fs_root); fs_root = NULL; } if (ptr == name + PATH_MAX - 1) { name[0] = '/'; name[1] = '\0'; } else { memmove(name, ptr, name + PATH_MAX - ptr); } return name; err: btrfs_put_root(fs_root); kfree(name); return ERR_PTR(ret); } static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_dir_item *di; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key location; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * Find the "default" dir item which points to the root item that we * will mount by default if we haven't been given a specific subvolume * to mount. */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and * mount the top-level subvolume. */ *objectid = BTRFS_FS_TREE_OBJECTID; return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); *objectid = location.objectid; return 0; } static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; set_default_d_op(sb, &btrfs_dentry_operations); sb->s_export_op = &btrfs_export_ops; #ifdef CONFIG_FS_VERITY sb->s_vop = &btrfs_verityops; #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; ret = super_setup_bdi(sb); if (ret) { btrfs_err(fs_info, "super_setup_bdi failed"); return ret; } ret = open_ctree(sb, fs_devices); if (ret) { btrfs_err(fs_info, "open_ctree failed: %d", ret); return ret; } btrfs_emit_options(fs_info, NULL); inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); btrfs_handle_fs_error(fs_info, ret, NULL); goto fail_close; } sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { ret = -ENOMEM; goto fail_close; } sb->s_flags |= SB_ACTIVE; return 0; fail_close: close_ctree(fs_info); return ret; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); return 0; } btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) { /* * Exit unless we have some pending changes * that need to go through commit */ if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags)) return 0; /* * 
A non-blocking test if the fs is frozen. We must not * start a new transaction here otherwise a deadlock * happens. The pending operations are delayed to the * next commit after thawing. */ if (sb_start_write_trylock(sb)) sb_end_write(sb); else return 0; trans = btrfs_start_transaction(root, 0); } if (IS_ERR(trans)) return PTR_ERR(trans); } return btrfs_commit_transaction(trans); } static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed) { seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s); *printed = true; } static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); const char *compress_type; const char *subvol_name; bool printed = false; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%u", info->thread_pool_size); if (btrfs_test_opt(info, COMPRESS)) { compress_type = btrfs_compress_type2str(info->compress_type); if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) print_rescue_option(seq, "nologreplay", &printed); if (btrfs_test_opt(info, USEBACKUPROOT)) print_rescue_option(seq, "usebackuproot", &printed); if (btrfs_test_opt(info, IGNOREBADROOTS)) print_rescue_option(seq, "ignorebadroots", &printed); if (btrfs_test_opt(info, IGNOREDATACSUMS)) print_rescue_option(seq, "ignoredatacsums", &printed); if (btrfs_test_opt(info, IGNOREMETACSUMS)) print_rescue_option(seq, "ignoremetacsums", &printed); if (btrfs_test_opt(info, IGNORESUPERFLAGS)) print_rescue_option(seq, "ignoresuperflags", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) seq_puts(seq, ",discard"); if (btrfs_test_opt(info, DISCARD_ASYNC)) seq_puts(seq, ",discard=async"); if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_free_space_cache_v1_active(info)) seq_puts(seq, ",space_cache"); else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(info, RESCAN_UUID_TREE)) seq_puts(seq, ",rescan_uuid_tree"); if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, 
",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%u", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); if (btrfs_test_opt(info, REF_TRACKER)) seq_puts(seq, ",ref_tracker"); seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { seq_show_option(seq, "subvol", subvol_name); kfree(subvol_name); } return 0; } /* * subvolumes are identified by ino 256 */ static inline bool is_subvolume_inode(struct inode *inode) { if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return true; return false; } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, struct vfsmount *mnt) { struct dentry *root; int ret; if (!subvol_name) { if (!subvol_objectid) { ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), &subvol_objectid); if (ret) { root = ERR_PTR(ret); goto out; } } subvol_name = btrfs_get_subvol_name_from_objectid( btrfs_sb(mnt->mnt_sb), subvol_objectid); if (IS_ERR(subvol_name)) { root = ERR_CAST(subvol_name); subvol_name = NULL; goto out; } } root = mount_subtree(mnt, subvol_name); /* mount_subtree() drops our reference on the vfsmount. */ mnt = NULL; if (!IS_ERR(root)) { struct super_block *s = root->d_sb; struct btrfs_fs_info *fs_info = btrfs_sb(s); struct inode *root_inode = d_inode(root); u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root); ret = 0; if (!is_subvolume_inode(root_inode)) { btrfs_err(fs_info, "'%s' is not a valid subvolume", subvol_name); ret = -EINVAL; } if (subvol_objectid && root_objectid != subvol_objectid) { /* * This will also catch a race condition where a * subvolume which was passed by ID is renamed and * another subvolume is renamed over the old location. 
*/ btrfs_err(fs_info, "subvol '%s' does not match subvolid %llu", subvol_name, subvol_objectid); ret = -EINVAL; } if (ret) { dput(root); root = ERR_PTR(ret); deactivate_locked_super(s); } } out: mntput(mnt); kfree(subvol_name); return root; } static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, u32 new_pool_size, u32 old_pool_size) { if (new_pool_size == old_pool_size) return; fs_info->thread_pool_size = new_pool_size; btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); workqueue_set_max_active(fs_info->endio_workers, new_pool_size); workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, unsigned long long old_opts, int flags) { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || (flags & SB_RDONLY))) { /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); if (flags & SB_RDONLY) sync_filesystem(fs_info->sb); } } static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long long old_opts) { const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); /* * We need to cleanup all defraggable inodes if the autodefragment is * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) { btrfs_cleanup_defrag_inodes(fs_info); } /* If we toggled discard async */ if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_resume(fs_info); else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && !btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_cleanup(fs_info); /* If we toggled space cache */ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) { int ret; if (BTRFS_FS_ERROR(fs_info)) { btrfs_err(fs_info, "remounting read-write after error is not allowed"); return -EINVAL; } if (fs_info->fs_devices->rw_devices == 0) return -EACCES; if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "too many missing devices, writable remount is not allowed"); return -EACCES; } if (btrfs_super_log_root(fs_info->super_copy) != 0) { btrfs_warn(fs_info, "mount required to replay tree-log, cannot remount read-write"); return -EINVAL; } /* * NOTE: when remounting with a change that does writes, don't put it * anywhere above this point, as we are not sure to be safe to write * until we pass the above checks. */ ret = btrfs_start_pre_rw_mount(fs_info); if (ret) return ret; btrfs_clear_sb_rdonly(fs_info->sb); set_bit(BTRFS_FS_OPEN, &fs_info->flags); /* * If we've gone from readonly -> read-write, we need to get our * sync/async discard lists in the right state. */ btrfs_discard_resume(fs_info); return 0; } static int btrfs_remount_ro(struct btrfs_fs_info *fs_info) { /* * This also happens on 'umount -rf' or on shutdown, when the * filesystem is busy. 
*/ cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); /* Wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* Avoid complains from lockdep et al. */ up(&fs_info->uuid_tree_rescan_sem); btrfs_set_sb_rdonly(fs_info->sb); /* * Setting SB_RDONLY will put the cleaner thread to sleep at the next * loop if it's already active. If it's already asleep, we'll leave * unused block groups on disk until we're mounted read-write again * unless we clean them up here. */ btrfs_delete_unused_bgs(fs_info); /* * The cleaner task could be already running before we set the flag * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make * sure that after we finish the remount, i.e. after we call * btrfs_commit_super(), the cleaner can no longer start a transaction * - either because it was dropping a dead root, running delayed iputs * or deleting an unused block group (the cleaner picked a block * group from the list of unused block groups before we were able to * in the previous call to btrfs_delete_unused_bgs()). */ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); /* * We've set the superblock to RO mode, so we might have made the * cleaner task sleep without running all pending delayed iputs. Go * through all the delayed iputs here, so that if an unmount happens * without remounting RW we don't end up at finishing close_ctree() * with a non-empty list of delayed iputs. */ btrfs_run_delayed_iputs(fs_info); btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); /* * Pause the qgroup rescan worker if it is running. We don't want it to * be still running after we are in RO mode, as after that, by the time * we unmount, it might have left a transaction open, so we would leak * the transaction and/or crash. */ btrfs_qgroup_wait_for_completion(fs_info, false); return btrfs_commit_super(fs_info); } static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) { fs_info->max_inline = ctx->max_inline; fs_info->commit_interval = ctx->commit_interval; fs_info->metadata_ratio = ctx->metadata_ratio; fs_info->thread_pool_size = ctx->thread_pool_size; fs_info->mount_opt = ctx->mount_opt; fs_info->compress_type = ctx->compress_type; fs_info->compress_level = ctx->compress_level; } static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) { ctx->max_inline = fs_info->max_inline; ctx->commit_interval = fs_info->commit_interval; ctx->metadata_ratio = fs_info->metadata_ratio; ctx->thread_pool_size = fs_info->thread_pool_size; ctx->mount_opt = fs_info->mount_opt; ctx->compress_type = fs_info->compress_type; ctx->compress_level = fs_info->compress_level; } #define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \ do { \ if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ btrfs_info(fs_info, fmt, ##args); \ } while (0) #define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) 
\ do { \ if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ btrfs_info(fs_info, fmt, ##args); \ } while (0) static void btrfs_emit_options(struct btrfs_fs_info *info, struct btrfs_fs_context *old) { btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow"); btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log"); btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time"); btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit"); btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard"); btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard"); btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree"); btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching"); btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache"); btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag"); btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data"); btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata"); btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification"); btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time"); btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots"); btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums"); btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums"); btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags"); btrfs_info_if_unset(info, old, NODATASUM, "setting datasum"); btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers"); btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag"); btrfs_info_if_unset(info, old, COMPRESS, "use no compression"); /* Did the compression settings change? */ if (btrfs_test_opt(info, COMPRESS) && (!old || old->compress_type != info->compress_type || old->compress_level != info->compress_level || (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) && btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) { const char *compress_type = btrfs_compress_type2str(info->compress_type); btrfs_info(info, "%s %s compression, level %d", btrfs_test_opt(info, FORCE_COMPRESS) ? 
"force" : "use", compress_type, info->compress_level); } if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) btrfs_info(info, "max_inline set to %llu", info->max_inline); } static int btrfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_context old_ctx; int ret = 0; bool mount_reconfigure = (fc->s_fs_info != NULL); btrfs_info_to_ctx(fs_info, &old_ctx); /* * This is our "bind mount" trick, we don't want to allow the user to do * anything other than mount a different ro/rw and a different subvol, * all of the mount options should be maintained. */ if (mount_reconfigure) ctx->mount_opt = old_ctx.mount_opt; sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) return -EINVAL; ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); if (ret < 0) return ret; btrfs_ctx_to_info(fs_info, ctx); btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_ctx.thread_pool_size); if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) { btrfs_warn(fs_info, "remount supports changing free space tree only from RO to RW"); /* Make sure free space cache options match the state on disk. */ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); } if (btrfs_free_space_cache_v1_active(fs_info)) { btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); } } ret = 0; if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) ret = btrfs_remount_ro(fs_info); else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) ret = btrfs_remount_rw(fs_info); if (ret) goto restore; /* * If we set the mask during the parameter parsing VFS would reject the * remount. Here we can set the mask and the value will be updated * appropriately. */ if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL)) fc->sb_flags_mask |= SB_POSIXACL; btrfs_emit_options(fs_info, &old_ctx); wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); btrfs_clear_oneshot_options(fs_info); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return 0; restore: btrfs_ctx_to_info(fs_info, &old_ctx); btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return ret; } /* Used to sort the devices by max_avail(descending sort) */ static int btrfs_cmp_device_free_bytes(const void *a, const void *b) { const struct btrfs_device_info *dev_info1 = a; const struct btrfs_device_info *dev_info2 = b; if (dev_info1->max_avail > dev_info2->max_avail) return -1; else if (dev_info1->max_avail < dev_info2->max_avail) return 1; return 0; } /* * sort the devices by max_avail, in which max free extent size of each device * is stored.(Descending Sort) */ static inline void btrfs_descending_sort_devices( struct btrfs_device_info *devices, size_t nr_devices) { sort(devices, nr_devices, sizeof(struct btrfs_device_info), btrfs_cmp_device_free_bytes, NULL); } /* * The helper to calc the free space on the devices that can be used to store * file data. 
*/ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { struct btrfs_device_info AUTO_KFREE(devices_info); struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 type; u64 avail_space; u64 min_stripe_size; int num_stripes = 1; int i = 0, nr_devices; const struct btrfs_raid_attr *rattr; /* * We aren't under the device list lock, so this is racy-ish, but good * enough for our purposes. */ nr_devices = fs_info->fs_devices->open_devices; if (!nr_devices) { smp_mb(); nr_devices = fs_info->fs_devices->open_devices; ASSERT(nr_devices); if (!nr_devices) { *free_bytes = 0; return 0; } } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), GFP_KERNEL); if (!devices_info) return -ENOMEM; /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) num_stripes = rattr->ncopies; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; /* Adjust for more than 1 stripe per device */ min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) || !device->bdev || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (i >= nr_devices) break; avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* * Ensure we have at least min_stripe_size on top of the * reserved space on the device. */ if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; avail_space -= BTRFS_DEVICE_RANGE_RESERVED; devices_info[i].dev = device; devices_info[i].max_avail = avail_space; i++; } rcu_read_unlock(); nr_devices = i; btrfs_descending_sort_devices(devices_info, nr_devices); i = nr_devices - 1; avail_space = 0; while (nr_devices >= rattr->devs_min) { num_stripes = min(num_stripes, nr_devices); if (devices_info[i].max_avail >= min_stripe_size) { int j; u64 alloc_size; avail_space += devices_info[i].max_avail * num_stripes; alloc_size = devices_info[i].max_avail; for (j = i + 1 - num_stripes; j <= i; j++) devices_info[j].max_avail -= alloc_size; } i--; nr_devices--; } *free_bytes = avail_space; return 0; } /* * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. * * If there's a redundant raid level at DATA block groups, use the respective * multiplier to scale the sizes. * * Unused device space usage is based on simulating the chunk allocator * algorithm that respects the device sizes and order of allocations. This is * a close approximation of the actual use but there are other factors that may * change the result (like a new metadata chunk). * * If metadata is exhausted, f_bavail will be 0. 
*/ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_super_block *disk_super = fs_info->super_copy; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; u64 total_free_meta = 0; u32 bits = fs_info->sectorsize_bits; __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; int mixed = 0; list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; total_free_data += found->disk_total - found->disk_used; total_free_data -= btrfs_account_ro_block_groups_free_space(found); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { if (!list_empty(&found->block_groups[i])) factor = btrfs_bg_type_to_factor( btrfs_raid_array[i].bg_flag); } } /* * Metadata in mixed block group profiles are accounted in data */ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) mixed = 1; else total_free_meta += found->disk_total - found->disk_used; } total_used += found->disk_used; } buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); /* Mixed block groups accounting is not byte-accurate, avoid overflow */ if (buf->f_bfree >= block_rsv->size >> bits) buf->f_bfree -= block_rsv->size >> bits; else buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); ret = btrfs_calc_avail_data_space(fs_info, &total_free_data); if (ret) return ret; buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; /* * We calculate the remaining metadata space minus global reserve. If * this is (supposedly) smaller than zero, there's no space. But this * does not hold in practice, the exhausted state happens where's still * some positive delta. So we apply some guesswork and compare the * delta to a 4M threshold. (Practically observed delta was ~2M.) * * We probably cannot calculate the exact threshold value because this * depends on the internal reservations requested by various * operations, so some operations that consume a few metadata will * succeed even if the Avail is zero. But this is better than the other * way around. */ thresh = SZ_4M; /* * We only want to claim there's no available space if we can no longer * allocate chunks for our metadata profile and our global reserve will * not fit in the free metadata space. If we aren't ->full then we * still can allocate chunks and thus are fine using the currently * calculated f_bavail. 
*/ if (!mixed && block_rsv->space_info->full && (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = fs_info->sectorsize; buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted on a big-endian or little-endian host */ buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32; buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root); return 0; } static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc) { struct btrfs_fs_info *p = fc->s_fs_info; struct btrfs_fs_info *fs_info = btrfs_sb(sb); return fs_info->fs_devices == p->fs_devices; } static int btrfs_get_tree_super(struct fs_context *fc) { struct btrfs_fs_info *fs_info = fc->s_fs_info; struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_device *device; struct super_block *sb; blk_mode_t mode = sb_open_mode(fc->sb_flags); int ret; btrfs_ctx_to_info(fs_info, ctx); mutex_lock(&uuid_mutex); /* * With 'true' passed to btrfs_scan_one_device() (mount time) we expect * either a valid device or an error. */ device = btrfs_scan_one_device(fc->source, true); ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); return PTR_ERR(device); } fs_devices = device->fs_devices; /* * We cannot hold uuid_mutex calling sget_fc(), it will lead to a * locking order reversal with s_umount. * * So here we increase the holding number of fs_devices, this will ensure * the fs_devices itself won't be freed. */ btrfs_fs_devices_inc_holding(fs_devices); fs_info->fs_devices = fs_devices; mutex_unlock(&uuid_mutex); sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); if (IS_ERR(sb)) { mutex_lock(&uuid_mutex); btrfs_fs_devices_dec_holding(fs_devices); /* * Since the fs_devices is not opened, it can be freed at any * time after unlocking uuid_mutex. We need to avoid double * free through put_fs_context()->btrfs_free_fs_info(). * So here we reset fs_info->fs_devices to NULL, and let the * regular fs_devices reclaim path to handle it. * * This applies to all later branches where no fs_devices is * opened. */ fs_info->fs_devices = NULL; mutex_unlock(&uuid_mutex); return PTR_ERR(sb); } if (sb->s_root) { /* * Not the first mount of the fs thus got an existing super block. * Will reuse the returned super block, fs_info and fs_devices. * * fc->s_fs_info is not touched and will be later freed by * put_fs_context() through btrfs_free_fs_context(). */ ASSERT(fc->s_fs_info == fs_info); mutex_lock(&uuid_mutex); btrfs_fs_devices_dec_holding(fs_devices); fs_info->fs_devices = NULL; mutex_unlock(&uuid_mutex); /* * At this stage we may have RO flag mismatch between * fc->sb_flags and sb->s_flags. Caller should detect such * mismatch and reconfigure with sb->s_umount rwsem held if * needed. */ } else { struct block_device *bdev; /* * The first mount of the fs thus a new superblock, fc->s_fs_info * must be NULL, and the ownership of our fs_info and fs_devices is * transferred to the super block. 
*/ ASSERT(fc->s_fs_info == NULL); mutex_lock(&uuid_mutex); btrfs_fs_devices_dec_holding(fs_devices); ret = btrfs_open_devices(fs_devices, mode, sb); if (ret < 0) fs_info->fs_devices = NULL; mutex_unlock(&uuid_mutex); if (ret < 0) { deactivate_locked_super(sb); return ret; } if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { deactivate_locked_super(sb); return -EACCES; } set_device_specific_options(fs_info); bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); ret = btrfs_fill_super(sb, fs_devices); if (ret) { deactivate_locked_super(sb); return ret; } } btrfs_clear_oneshot_options(fs_info); fc->root = dget(sb->s_root); return 0; } /* * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes * with different ro/rw options") the following works: * * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar * * which looks nice and innocent but is actually pretty intricate and deserves * a long comment. * * On another filesystem a subvolume mount is close to something like: * * (iii) # create rw superblock + initial mount * mount -t xfs /dev/sdb /opt/ * * # create ro bind mount * mount --bind -o ro /opt/foo /mnt/foo * * # unmount initial mount * umount /opt * * Of course, there's some special subvolume sauce and there's the fact that the * sb->s_root dentry is really swapped after mount_subtree(). But conceptually * it's very close and will help us understand the issue. * * The old mount API didn't cleanly distinguish between a mount being made ro * and a superblock being made ro. The only way to change the ro state of * either object was by passing MS_RDONLY. If a new mount was created via * mount(2) such as: * * mount("/dev/sdb", "/mnt", "xfs", MS_RDONLY, NULL); * * the MS_RDONLY flag being specified had two effects: * * (1) MNT_READONLY was raised -> the resulting mount got * @mnt->mnt_flags |= MNT_READONLY raised. * * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystem * made the superblock ro. Note how SB_RDONLY has the same value as * MS_RDONLY and is raised whenever MS_RDONLY is passed through mount(2). * * Creating a subtree mount via (iii) ends up leaving a rw superblock with a * subtree mounted ro. * * But consider the effect of the old mount API on btrfs subvolume mounting, * which combines the distinct steps in (iii) into a single step. * * By issuing (i) both the mount and the superblock are turned ro. Now when (ii) * is issued the superblock is ro and thus even if the mount created for (ii) is * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro * to rw for (ii), which it did using an internal remount call. * * IOW, subvolume mounting was inherently complicated due to the ambiguity of * MS_RDONLY in mount(2). Note that mount(8) always translates "ro" to MS_RDONLY, * so in both (i) and (ii) "ro" becomes MS_RDONLY when passed by mount(8) to * mount(2). * * Enter the new mount API. The new mount API disambiguates making a mount ro * and making a superblock ro. * * (3) To turn a mount ro the MOUNT_ATTR_RDONLY flag can be used with either * fsmount() or mount_setattr(); this is a pure VFS level change for a * specific mount or mount tree that is never seen by the filesystem itself. * * (4) To turn a superblock ro the "ro" flag must be used with * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem * in fc->sb_flags.
* * But, currently the util-linux mount command already utilizes the new mount * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's * btrfs or not, setting the whole super block RO. To make per-subvolume mounting * work with different options work we need to keep backward compatibility. */ static int btrfs_reconfigure_for_mount(struct fs_context *fc) { int ret = 0; if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY)) ret = btrfs_reconfigure(fc); return ret; } static int btrfs_get_tree_subvol(struct fs_context *fc) { struct btrfs_fs_info *fs_info = NULL; struct btrfs_fs_context *ctx = fc->fs_private; struct fs_context *dup_fc; struct dentry *dentry; struct vfsmount *mnt; int ret = 0; /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need * then open_ctree will properly initialize the file system specific * settings later. btrfs_init_fs_info initializes the static elements * of the fs_info (locks and such) to make cleanup easier if we find a * superblock with our given fs_devices later on at sget() time. */ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) return -ENOMEM; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); if (!fs_info->super_copy || !fs_info->super_for_commit) { /* * Dont call btrfs_free_fs_info() to free it as it's still * initialized partially. */ kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); return -ENOMEM; } btrfs_init_fs_info(fs_info); dup_fc = vfs_dup_fs_context(fc); if (IS_ERR(dup_fc)) { btrfs_free_fs_info(fs_info); return PTR_ERR(dup_fc); } /* * When we do the sget_fc this gets transferred to the sb, so we only * need to set it on the dup_fc as this is what creates the super block. */ dup_fc->s_fs_info = fs_info; ret = btrfs_get_tree_super(dup_fc); if (ret) goto error; ret = btrfs_reconfigure_for_mount(dup_fc); up_write(&dup_fc->root->d_sb->s_umount); if (ret) goto error; mnt = vfs_create_mount(dup_fc); put_fs_context(dup_fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); /* * This free's ->subvol_name, because if it isn't set we have to * allocate a buffer to hold the subvol_name, so we just drop our * reference to it here. */ dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt); ctx->subvol_name = NULL; if (IS_ERR(dentry)) return PTR_ERR(dentry); fc->root = dentry; return 0; error: put_fs_context(dup_fc); return ret; } static int btrfs_get_tree(struct fs_context *fc) { ASSERT(fc->s_fs_info == NULL); return btrfs_get_tree_subvol(fc); } static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); btrfs_free_fs_info(fs_info); } static void btrfs_free_fs_context(struct fs_context *fc) { struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_info *fs_info = fc->s_fs_info; if (fs_info) btrfs_free_fs_info(fs_info); if (ctx && refcount_dec_and_test(&ctx->refs)) { kfree(ctx->subvol_name); kfree(ctx); } } static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc) { struct btrfs_fs_context *ctx = src_fc->fs_private; /* * Give a ref to our ctx to this dup, as we want to keep it around for * our original fc so we can have the subvolume name or objectid. 
* * We unset ->source in the original fc because the dup needs it for * mounting, and then once we free the dup it'll free ->source, so we * need to make sure we're only pointing to it in one fc. */ refcount_inc(&ctx->refs); fc->fs_private = ctx; fc->source = src_fc->source; src_fc->source = NULL; return 0; } static const struct fs_context_operations btrfs_fs_context_ops = { .parse_param = btrfs_parse_param, .reconfigure = btrfs_reconfigure, .get_tree = btrfs_get_tree, .dup = btrfs_dup_fs_context, .free = btrfs_free_fs_context, }; static int btrfs_init_fs_context(struct fs_context *fc) { struct btrfs_fs_context *ctx; ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; refcount_set(&ctx->refs, 1); fc->fs_private = ctx; fc->ops = &btrfs_fs_context_ops; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx); } else { ctx->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } #ifdef CONFIG_BTRFS_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #endif fc->sb_flags |= SB_I_VERSION; return 0; } static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .init_fs_context = btrfs_init_fs_context, .parameters = btrfs_fs_parameters, .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) { /* * The control file's private_data is used to hold the * transaction when it is started and is used to keep * track of whether a transaction is already in progress. */ file->private_data = NULL; return 0; } /* * Used by /dev/btrfs-control for devices ioctls. */ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct btrfs_ioctl_vol_args *vol; struct btrfs_device *device = NULL; dev_t devt = 0; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); ret = btrfs_check_ioctl_vol_args_path(vol); if (ret < 0) goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); /* * Scanning outside of mount can return NULL which would turn * into 0 error code. */ device = btrfs_scan_one_device(vol->name, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_FORGET_DEV: if (vol->name[0] != 0) { ret = lookup_bdev(vol->name, &devt); if (ret) break; } ret = btrfs_forget_devices(devt); break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); /* * Scanning outside of mount can return NULL which would turn * into 0 error code. */ device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR_OR_ZERO(device); break; } ret = !(device->fs_devices->num_devices == device->fs_devices->total_devices); mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_GET_SUPPORTED_FEATURES: ret = btrfs_ioctl_get_supported_features((void __user*)arg); break; } out: kfree(vol); return ret; } static int btrfs_freeze(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* * We don't need a barrier here, we'll wait for any transaction that * could be in progress on other threads (and do delayed iputs that * we want to avoid on a frozen filesystem), or do the commit * ourselves. 
*/ return btrfs_commit_current_transaction(fs_info->tree_root); } static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; u64 last_trans; u16 csum_type; int ret = 0; /* This should be called with fs still frozen. */ ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags)); /* Missing dev, no need to check. */ if (!dev->bdev) return 0; /* Only need to check the primary super block. */ sb = btrfs_read_disk_super(dev->bdev, 0, true); if (IS_ERR(sb)) return PTR_ERR(sb); /* Verify the checksum. */ csum_type = btrfs_super_csum_type(sb); if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) { btrfs_err(fs_info, "csum type changed, has %u expect %u", csum_type, btrfs_super_csum_type(fs_info->super_copy)); ret = -EUCLEAN; goto out; } if (unlikely(btrfs_check_super_csum(fs_info, sb))) { btrfs_err(fs_info, "csum for on-disk super block no longer matches"); ret = -EUCLEAN; goto out; } /* Btrfs_validate_super() includes fsid check against super->fsid. */ ret = btrfs_validate_super(fs_info, sb, 0); if (ret < 0) goto out; last_trans = btrfs_get_last_trans_committed(fs_info); if (unlikely(btrfs_super_generation(sb) != last_trans)) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; goto out; } out: btrfs_release_disk_super(sb); return ret; } static int btrfs_unfreeze(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_device *device; int ret = 0; /* * Make sure the fs is not changed by accident (like hibernation then * modified by other OS). * If we found anything wrong, we mark the fs error immediately. * * And since the fs is frozen, no one can modify the fs yet, thus * we don't need to hold device_list_mutex. */ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { ret = check_dev_super(device); if (ret < 0) { btrfs_handle_fs_error(fs_info, ret, "super block on devid %llu got modified unexpectedly", device->devid); break; } } clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* * We still return 0, to allow VFS layer to unfreeze the fs even the * above checks failed. Since the fs is either fine or read-only, we're * safe to continue, without causing further damage. */ return 0; } static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); /* * There should be always a valid pointer in latest_dev, it may be stale * for a short moment in case it's being deleted but still valid until * the end of RCU grace period. */ rcu_read_lock(); seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); rcu_read_unlock(); return 0; } static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); trace_btrfs_extent_map_shrinker_count(fs_info, nr); return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) { const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); btrfs_free_extent_maps(fs_info, nr_to_scan); /* The extent map shrinker runs asynchronously, so always return 0. 
*/ return 0; } #ifdef CONFIG_BTRFS_EXPERIMENTAL static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_device *device; struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev }; bool can_rw; mutex_lock(&fs_info->fs_devices->device_list_mutex); device = btrfs_find_device(fs_info->fs_devices, &lookup_args); if (!device) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* Device not found, should not affect the running fs, just give a warning. */ btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev); return 0; } /* * The to-be-removed device is already missing? * * That's weird but no special handling needed and can exit right now. */ if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid); return 0; } device->fs_devices->missing_devices++; if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); WARN_ON(device->fs_devices->rw_devices < 1); device->fs_devices->rw_devices--; } can_rw = btrfs_check_rw_degradable(fs_info, device); mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* * Now device is considered missing, btrfs_device_name() won't give a * meaningful result anymore, so only output the devid. */ if (unlikely(!can_rw)) { btrfs_crit(fs_info, "btrfs device id %llu has gone missing, can not maintain read-write", device->devid); return -EIO; } btrfs_warn(fs_info, "btrfs device id %llu has gone missing, continue as degraded", device->devid); btrfs_set_opt(fs_info->mount_opt, DEGRADED); return 0; } static void btrfs_shutdown(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); btrfs_force_shutdown(fs_info); } #endif static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .show_devname = btrfs_show_devname, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, #ifdef CONFIG_BTRFS_EXPERIMENTAL .remove_bdev = btrfs_remove_bdev, .shutdown = btrfs_shutdown, #endif }; static const struct file_operations btrfs_ctl_fops = { .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, .compat_ioctl = compat_ptr_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice btrfs_misc = { .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; MODULE_ALIAS_MISCDEV(BTRFS_MINOR); MODULE_ALIAS("devname:btrfs-control"); static int __init btrfs_interface_init(void) { return misc_register(&btrfs_misc); } static __cold void btrfs_interface_exit(void) { misc_deregister(&btrfs_misc); } static int __init btrfs_print_mod_info(void) { static const char options[] = "" #ifdef CONFIG_BTRFS_EXPERIMENTAL ", experimental=on" #endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif #ifdef CONFIG_BLK_DEV_ZONED ", zoned=yes" #else ", zoned=no" #endif #ifdef CONFIG_FS_VERITY ", fsverity=yes" #else ", fsverity=no" #endif ; #ifdef CONFIG_BTRFS_EXPERIMENTAL if (btrfs_get_mod_read_policy() == NULL) 
pr_info("Btrfs loaded%s\n", options); else pr_info("Btrfs loaded%s, read_policy=%s\n", options, btrfs_get_mod_read_policy()); #else pr_info("Btrfs loaded%s\n", options); #endif return 0; } static int register_btrfs(void) { return register_filesystem(&btrfs_fs_type); } static void unregister_btrfs(void) { unregister_filesystem(&btrfs_fs_type); } /* Helper structure for long init/exit functions. */ struct init_sequence { int (*init_func)(void); /* Can be NULL if the init_func doesn't need cleanup. */ void (*exit_func)(void); }; static const struct init_sequence mod_init_seq[] = { { .init_func = btrfs_props_init, .exit_func = NULL, }, { .init_func = btrfs_init_sysfs, .exit_func = btrfs_exit_sysfs, }, { .init_func = btrfs_init_compress, .exit_func = btrfs_exit_compress, }, { .init_func = btrfs_init_cachep, .exit_func = btrfs_destroy_cachep, }, { .init_func = btrfs_init_dio, .exit_func = btrfs_destroy_dio, }, { .init_func = btrfs_transaction_init, .exit_func = btrfs_transaction_exit, }, { .init_func = btrfs_ctree_init, .exit_func = btrfs_ctree_exit, }, { .init_func = btrfs_free_space_init, .exit_func = btrfs_free_space_exit, }, { .init_func = btrfs_extent_state_init_cachep, .exit_func = btrfs_extent_state_free_cachep, }, { .init_func = extent_buffer_init_cachep, .exit_func = extent_buffer_free_cachep, }, { .init_func = btrfs_bioset_init, .exit_func = btrfs_bioset_exit, }, { .init_func = btrfs_extent_map_init, .exit_func = btrfs_extent_map_exit, #ifdef CONFIG_BTRFS_EXPERIMENTAL }, { .init_func = btrfs_read_policy_init, .exit_func = NULL, #endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, }, { .init_func = btrfs_delayed_inode_init, .exit_func = btrfs_delayed_inode_exit, }, { .init_func = btrfs_auto_defrag_init, .exit_func = btrfs_auto_defrag_exit, }, { .init_func = btrfs_delayed_ref_init, .exit_func = btrfs_delayed_ref_exit, }, { .init_func = btrfs_prelim_ref_init, .exit_func = btrfs_prelim_ref_exit, }, { .init_func = btrfs_interface_init, .exit_func = btrfs_interface_exit, }, { .init_func = btrfs_print_mod_info, .exit_func = NULL, }, { .init_func = btrfs_run_sanity_tests, .exit_func = NULL, }, { .init_func = register_btrfs, .exit_func = unregister_btrfs, } }; static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; static __always_inline void btrfs_exit_btrfs_fs(void) { int i; for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { if (!mod_init_result[i]) continue; if (mod_init_seq[i].exit_func) mod_init_seq[i].exit_func(); mod_init_result[i] = false; } } static void __exit exit_btrfs_fs(void) { btrfs_exit_btrfs_fs(); btrfs_cleanup_fs_uuids(); } static int __init init_btrfs_fs(void) { int ret; int i; for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { ASSERT(!mod_init_result[i]); ret = mod_init_seq[i].init_func(); if (ret < 0) { btrfs_exit_btrfs_fs(); return ret; } mod_init_result[i] = true; } return 0; } late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); MODULE_SOFTDEP("pre: xxhash64"); MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: blake2b-256"); |
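/*
 * Illustrative sketch (not part of the kernel sources): the table-driven
 * init/rollback pattern used by mod_init_seq and btrfs_exit_btrfs_fs() above,
 * reduced to a standalone user-space program. All names here (demo_step,
 * demo_seq, step_a/step_b) are made up for the example: init functions run in
 * order, and on failure only the steps that succeeded are unwound, in reverse
 * order.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_step {
	int (*init_func)(void);
	void (*exit_func)(void);	/* NULL if no cleanup is needed */
};

static int step_a_init(void) { puts("a: init"); return 0; }
static void step_a_exit(void) { puts("a: exit"); }
static int step_b_init(void) { puts("b: init failed"); return -1; /* simulated failure */ }

static const struct demo_step demo_seq[] = {
	{ .init_func = step_a_init, .exit_func = step_a_exit },
	{ .init_func = step_b_init, .exit_func = NULL },
};
static bool demo_result[sizeof(demo_seq) / sizeof(demo_seq[0])];

int main(void)
{
	int i;

	for (i = 0; i < (int)(sizeof(demo_seq) / sizeof(demo_seq[0])); i++) {
		if (demo_seq[i].init_func() < 0)
			goto rollback;
		demo_result[i] = true;
	}
	return 0;

rollback:
	/* Unwind only the steps that succeeded, newest first. */
	for (i--; i >= 0; i--)
		if (demo_result[i] && demo_seq[i].exit_func)
			demo_seq[i].exit_func();
	return 1;
}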
// SPDX-License-Identifier: GPL-2.0 #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/xarray.h> #include <net/busy_poll.h> #include <net/net_debug.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/types.h> #include <net/page_pool/memory_provider.h> #include <net/sock.h> #include "page_pool_priv.h" #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); /* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, * pool->user. * Ordering: inside rtnl_lock */ DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time.
Following page pool "visibility" * states are possible: * - normal * - user.list: linked to real netdev, netdev: real netdev * - orphaned - real netdev has disappeared * - user.list: linked to lo, netdev: lo * - invisible - either (a) created without netdev linking, (b) unlisted due * to error, or (c) the entire namespace which owned this pool disappeared * - user.list: unhashed, netdev: unknown */ typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info); static int netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) { struct page_pool *pool; struct sk_buff *rsp; int err; mutex_lock(&page_pools_lock); pool = xa_load(&page_pools, id); if (!pool || hlist_unhashed(&pool->user.list) || !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { err = -ENOENT; goto err_unlock; } rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) { err = -ENOMEM; goto err_unlock; } err = fill(rsp, pool, info); if (err) goto err_free_msg; mutex_unlock(&page_pools_lock); return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); err_unlock: mutex_unlock(&page_pools_lock); return err; } struct page_pool_dump_cb { unsigned long ifindex; u32 pp_id; }; static int netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, pp_nl_fill_cb fill) { struct page_pool_dump_cb *state = (void *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; struct page_pool *pool; int err = 0; rtnl_lock(); mutex_lock(&page_pools_lock); for_each_netdev_dump(net, netdev, state->ifindex) { hlist_for_each_entry(pool, &netdev->page_pools, user.list) { if (state->pp_id && state->pp_id < pool->user.id) continue; state->pp_id = pool->user.id; err = fill(skb, pool, info); if (err) goto out; } state->pp_id = 0; } out: mutex_unlock(&page_pools_lock); rtnl_unlock(); return err; } static int page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { #ifdef CONFIG_PAGE_POOL_STATS struct page_pool_stats stats = {}; struct nlattr *nest; void *hdr; if (!page_pool_get_stats(pool, &stats)) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex))) goto err_cancel_nest; nla_nest_end(rsp, nest); if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, stats.alloc_stats.fast) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, stats.alloc_stats.slow) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, stats.alloc_stats.slow_high_order) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, stats.alloc_stats.empty) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, stats.alloc_stats.refill) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, stats.alloc_stats.waive) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, stats.recycle_stats.cached) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, stats.recycle_stats.cache_full) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, stats.recycle_stats.ring) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, stats.recycle_stats.ring_full) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, 
stats.recycle_stats.released_refcnt)) goto err_cancel_msg; genlmsg_end(rsp, hdr); return 0; err_cancel_nest: nla_nest_cancel(rsp, nest); err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; #else GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); return -EOPNOTSUPP; #endif } int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; struct nlattr *nest; int err; u32 id; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) return -EINVAL; nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, netdev_page_pool_info_nl_policy, info->extack); if (err) return err; if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) return -EINVAL; if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NETDEV_A_PAGE_POOL_IFINDEX], "selecting by ifindex not supported"); return -EINVAL; } id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); } int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); } static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { size_t inflight, refsz; unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) goto err_cancel; if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; if (napi_id_valid(napi_id) && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); refsz = PAGE_SIZE << pool->p.order; if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, inflight * refsz)) goto err_cancel; if (pool->user.detach_time && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, pool->user.detach_time)) goto err_cancel; if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); return 0; err_cancel: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) { struct genl_info info; struct sk_buff *ntf; struct net *net; lockdep_assert_held(&page_pools_lock); /* 'invisible' page pools don't matter */ if (hlist_unhashed(&pool->user.list)) return; net = dev_net(pool->slow.netdev); if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (page_pool_nl_fill(ntf, pool, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, net, ntf, 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); } int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 id; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) return -EINVAL; id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); } int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); } int page_pool_list(struct 
page_pool *pool) { static u32 id_alloc_next; int err; mutex_lock(&page_pools_lock); err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, &id_alloc_next, GFP_KERNEL); if (err < 0) goto err_unlock; INIT_HLIST_NODE(&pool->user.list); if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } mutex_unlock(&page_pools_lock); return 0; err_unlock: mutex_unlock(&page_pools_lock); return err; } void page_pool_detached(struct page_pool *pool) { mutex_lock(&page_pools_lock); pool->user.detach_time = ktime_get_boottime_seconds(); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); mutex_unlock(&page_pools_lock); } void page_pool_unlist(struct page_pool *pool) { mutex_lock(&page_pools_lock); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); xa_erase(&page_pools, pool->user.id); if (!hlist_unhashed(&pool->user.list)) hlist_del(&pool->user.list); mutex_unlock(&page_pools_lock); } int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { void *binding = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; if (!binding) return 0; mutex_lock(&page_pools_lock); hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { if (pool->mp_priv != binding) continue; if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { mutex_unlock(&page_pools_lock); return 0; } } mutex_unlock(&page_pools_lock); return -ENODATA; } static void page_pool_unreg_netdev_wipe(struct net_device *netdev) { struct page_pool *pool; struct hlist_node *n; mutex_lock(&page_pools_lock); hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { hlist_del_init(&pool->user.list); pool->slow.netdev = NET_PTR_POISON; } mutex_unlock(&page_pools_lock); } static void page_pool_unreg_netdev(struct net_device *netdev) { struct page_pool *pool, *last; struct net_device *lo; lo = dev_net(netdev)->loopback_dev; mutex_lock(&page_pools_lock); last = NULL; hlist_for_each_entry(pool, &netdev->page_pools, user.list) { pool->slow.netdev = lo; netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); last = pool; } if (last) hlist_splice_init(&netdev->page_pools, &last->user.list, &lo->page_pools); mutex_unlock(&page_pools_lock); } static int page_pool_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; if (hlist_empty(&netdev->page_pools)) return NOTIFY_OK; if (netdev->ifindex != LOOPBACK_IFINDEX) page_pool_unreg_netdev(netdev); else page_pool_unreg_netdev_wipe(netdev); return NOTIFY_OK; } static struct notifier_block page_pool_netdevice_nb = { .notifier_call = page_pool_netdevice_event, }; static int __init page_pool_user_init(void) { return register_netdevice_notifier(&page_pool_netdevice_nb); } subsys_initcall(page_pool_user_init); |
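/*
 * Standalone sketch (not part of the kernel sources) of the listing scheme
 * implemented by page_pool_list()/page_pool_unlist() above: each pool gets a
 * cyclically allocated id and is only user-visible while it is linked to a
 * netdev. The xarray and hlist are replaced here by a plain array and a
 * "listed" flag; all demo_* names are made up for the example.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_POOLS 8

struct demo_pool {
	unsigned int id;
	bool listed;	/* mirrors !hlist_unhashed(&pool->user.list) */
};

static struct demo_pool *pools[MAX_POOLS];
static unsigned int id_alloc_next = 1;	/* ids start at 1, as with XA_FLAGS_ALLOC1 */

/* Cyclic id allocation: keep handing out increasing ids, wrap when needed. */
static int demo_pool_list(struct demo_pool *pool, bool has_netdev)
{
	for (unsigned int tries = 0; tries < MAX_POOLS; tries++) {
		unsigned int id = id_alloc_next;

		id_alloc_next = (id_alloc_next % MAX_POOLS) + 1;
		if (!pools[id - 1]) {
			pools[id - 1] = pool;
			pool->id = id;
			/* Pools created without a netdev stay invisible. */
			pool->listed = has_netdev;
			return 0;
		}
	}
	return -1;	/* id space exhausted, like xa_alloc_cyclic() failing */
}

static void demo_pool_unlist(struct demo_pool *pool)
{
	pools[pool->id - 1] = NULL;
	pool->listed = false;
}

int main(void)
{
	struct demo_pool a = { 0 }, b = { 0 };

	demo_pool_list(&a, true);
	demo_pool_list(&b, false);
	printf("a: id=%u visible=%d\n", a.id, a.listed);
	printf("b: id=%u visible=%d\n", b.id, b.listed);
	demo_pool_unlist(&a);
	demo_pool_unlist(&b);
	return 0;
}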
// SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top.
* * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the lock is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
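/*
 * Usage sketch (illustrative only, not taken from any particular caller):
 * the canonical pattern for refcount_dec_and_lock() documented above is a
 * "put" helper for an object that also sits on a locked lookup list. The
 * demo_obj/demo_lock/demo_obj_put names are made up for the example; on the
 * last reference the lock is returned held, so the object can be unlinked
 * and freed without racing against concurrent lookups.
 */
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_obj {
	refcount_t refs;
	struct list_head node;	/* protected by demo_lock */
};

static DEFINE_SPINLOCK(demo_lock);

static void demo_obj_put(struct demo_obj *obj)
{
	/* Fast path: not the last reference, no locking needed. */
	if (!refcount_dec_and_lock(&obj->refs, &demo_lock))
		return;

	/* Last reference: demo_lock is now held, unlink then free. */
	list_del(&obj->node);
	spin_unlock(&demo_lock);
	kfree(obj);
}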
/* SPDX-License-Identifier: GPL-2.0 */ /* * Kernel Electric-Fence (KFENCE). For more info please see * Documentation/dev-tools/kfence.rst. * * Copyright (C) 2020, Google LLC. */ #ifndef MM_KFENCE_KFENCE_H #define MM_KFENCE_KFENCE_H #include <linux/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include "../slab.h" /* for struct kmem_cache */ /* * Get the canary byte pattern for @addr. Use a pattern that varies based on the * lower 3 bits of the address, to detect memory corruptions with higher * probability, where similar constants are used. */ #define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7)) /* * Define a continuous 8-byte canary starting from a multiple of 8. The canary * of each byte is only related to the lowest three bits of its address, so the * canary of every 8 bytes is the same. 64-bit memory can be filled and checked * at a time instead of byte by byte to improve performance. */ #define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100))) /* Maximum stack depth for reports. */ #define KFENCE_STACK_DEPTH 64 /* KFENCE object states. */ enum kfence_object_state { KFENCE_OBJECT_UNUSED, /* Object is unused. */ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ KFENCE_OBJECT_RCU_FREEING, /* Object was allocated, and then being freed by rcu. */ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ }; /* Alloc/free tracking information. */ struct kfence_track { pid_t pid; int cpu; u64 ts_nsec; int num_stack_entries; unsigned long stack_entries[KFENCE_STACK_DEPTH]; }; /* KFENCE metadata per guarded allocation. */ struct kfence_metadata { struct list_head list; /* Freelist node; access under kfence_freelist_lock. */ struct rcu_head rcu_head; /* For delayed freeing. */ /* * Lock protecting below data; to ensure consistency of the below data, * since the following may execute concurrently: __kfence_alloc(), * __kfence_free(), kfence_handle_page_fault(). However, note that we * cannot grab the same metadata off the freelist twice, and multiple * __kfence_alloc() cannot run concurrently on the same metadata. */ raw_spinlock_t lock; /* The current state of the object; see above. */ enum kfence_object_state state; /* * Allocated object address; cannot be calculated from size, because of * alignment requirements. * * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant. */ unsigned long addr; /* * The size of the original allocation. */ size_t size; /* * The kmem_cache cache of the last allocation; NULL if never allocated * or the cache has already been destroyed. */ struct kmem_cache *cache; /* * In case of an invalid access, the page that was unprotected; we * optimistically only store one address. */ unsigned long unprotected_page; /* Allocation and free stack information. */ struct kfence_track alloc_track; struct kfence_track free_track; /* For updating alloc_covered on frees.
*/ u32 alloc_stack_hash; #ifdef CONFIG_MEMCG struct slabobj_ext obj_exts; #endif }; #define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \ CONFIG_KFENCE_NUM_OBJECTS) extern struct kfence_metadata *kfence_metadata; static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) { long index; /* The checks do not affect performance; only called from slow-paths. */ if (!is_kfence_address((void *)addr)) return NULL; /* * May be an invalid index if called with an address at the edge of * __kfence_pool, in which case we would report an "invalid access" * error. */ index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) return NULL; return &kfence_metadata[index]; } /* KFENCE error types for report generation. */ enum kfence_error_type { KFENCE_ERROR_OOB, /* Detected an out-of-bounds access. */ KFENCE_ERROR_UAF, /* Detected a use-after-free access. */ KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */ KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */ KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, const struct kfence_metadata *meta, enum kfence_error_type type); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); #endif /* MM_KFENCE_KFENCE_H */
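/*
 * Standalone sketch (not part of the kernel sources): a worked check of the
 * canary macros above. Filling memory eight bytes at a time with
 * KFENCE_CANARY_PATTERN_U64 must produce exactly the per-byte
 * KFENCE_CANARY_PATTERN_U8 values, which is what allows canaries to be
 * verified a word at a time. The sketch assumes a little-endian host; in the
 * kernel the le64_to_cpu() in the macro handles big-endian byte order.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CANARY_U8(addr)	((uint8_t)0xaa ^ (uint8_t)((uintptr_t)(addr) & 0x7))
#define CANARY_U64	((uint64_t)0xaaaaaaaaaaaaaaaaULL ^ (uint64_t)0x0706050403020100ULL)

int main(void)
{
	uint8_t buf[8];
	uint64_t word = CANARY_U64;
	int i;

	memcpy(buf, &word, sizeof(buf));	/* simulate the 8-byte fill */
	for (i = 0; i < 8; i++) {
		printf("offset %d: fill=%#x per-byte=%#x\n", i, buf[i], CANARY_U8(i));
		if (buf[i] != CANARY_U8(i))
			return 1;	/* mismatch would indicate a broken assumption */
	}
	return 0;
}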
// SPDX-License-Identifier: GPL-2.0-only /* * binfmt_misc.c * * Copyright (C) 1997 Richard Günther * * binfmt_misc detects binaries via a magic or filename extension and invokes * a specified wrapper. See Documentation/admin-guide/binfmt-misc.rst for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/string_helpers.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/uaccess.h> #include "internal.h" #ifdef DEBUG # define USE_DEBUG 1 #else # define USE_DEBUG 0 #endif enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ }; enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) #define MISC_FMT_OPEN_BINARY (1UL << 30) #define MISC_FMT_CREDENTIALS (1UL << 29) #define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; unsigned long flags; /* type, status, etc. */ int offset; /* offset of magic */ int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; refcount_t users; /* sync removal with load_misc_binary() */ } Node; static struct file_system_type bm_fs_type; /* * Max length of the register string. Determined by: * - 7 delimiters * - name: ~50 bytes * - type: 1 byte * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE) * - magic: 128 bytes (512 in escaped form) * - mask: 128 bytes (512 in escaped form) * - interp: ~50 bytes * - flags: 5 bytes * Round that up a bit, and then back off to hold the internal data * (like struct Node). */ #define MAX_REGISTER_LENGTH 1920 /** * search_binfmt_handler - search for a binary handler for @bprm * @misc: handle to binfmt_misc instance * @bprm: binary for which we are looking for a handler * * Search for a binary type handler for @bprm in the list of registered binary * type handlers. * * Return: binary type list entry on success, NULL on failure */ static Node *search_binfmt_handler(struct binfmt_misc *misc, struct linux_binprm *bprm) { char *p = strrchr(bprm->interp, '.'); Node *e; /* Walk all the registered handlers. */ list_for_each_entry(e, &misc->entries, list) { char *s; int j; /* Make sure this one is currently enabled. */ if (!test_bit(Enabled, &e->flags)) continue; /* Do matching based on extension if applicable.
*/ if (!test_bit(Magic, &e->flags)) { if (p && !strcmp(e->magic, p + 1)) return e; continue; } /* Do matching based on magic & mask. */ s = bprm->buf + e->offset; if (e->mask) { for (j = 0; j < e->size; j++) if ((*s++ ^ e->magic[j]) & e->mask[j]) break; } else { for (j = 0; j < e->size; j++) if ((*s++ ^ e->magic[j])) break; } if (j == e->size) return e; } return NULL; } /** * get_binfmt_handler - try to find a binary type handler * @misc: handle to binfmt_misc instance * @bprm: binary for which we are looking for a handler * * Try to find a binfmt handler for the binary type. If one is found take a * reference to protect against removal via bm_{entry,status}_write(). * * Return: binary type list entry on success, NULL on failure */ static Node *get_binfmt_handler(struct binfmt_misc *misc, struct linux_binprm *bprm) { Node *e; read_lock(&misc->entries_lock); e = search_binfmt_handler(misc, bprm); if (e) refcount_inc(&e->users); read_unlock(&misc->entries_lock); return e; } /** * put_binfmt_handler - put binary handler node * @e: node to put * * Free node syncing with load_misc_binary() and defer final free to * load_misc_binary() in case it is using the binary type handler we were * requested to remove. */ static void put_binfmt_handler(Node *e) { if (refcount_dec_and_test(&e->users)) { if (e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); kfree(e); } } /** * load_binfmt_misc - load the binfmt_misc of the caller's user namespace * * To be called in load_misc_binary() to load the relevant struct binfmt_misc. * If a user namespace doesn't have its own binfmt_misc mount it can make use * of its ancestor's binfmt_misc handlers. This mimicks the behavior of * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where * available to all user and user namespaces on the system. * * Return: the binfmt_misc instance of the caller's user namespace */ static struct binfmt_misc *load_binfmt_misc(void) { const struct user_namespace *user_ns; struct binfmt_misc *misc; user_ns = current_user_ns(); while (user_ns) { /* Pairs with smp_store_release() in bm_fill_super(). */ misc = smp_load_acquire(&user_ns->binfmt_misc); if (misc) return misc; user_ns = user_ns->parent; } return &init_binfmt_misc; } /* * the loader itself */ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; int retval = -ENOEXEC; struct binfmt_misc *misc; misc = load_binfmt_misc(); if (!misc->enabled) return retval; fmt = get_binfmt_handler(misc, bprm); if (!fmt) return retval; /* Need to be able to load the file after exec */ retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) goto ret; if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) { bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0; } else { retval = remove_arg_zero(bprm); if (retval) goto ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) bprm->have_execfd = 1; /* make argv[1] be the path to the binary */ retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto ret; bprm->argc++; /* add the interp as argv[0] */ retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto ret; bprm->argc++; /* Update interp in case binfmt_script needs it. 
*/ retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto ret; if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = file_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto ret; bprm->interpreter = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) bprm->execfd_creds = 1; retval = 0; ret: /* * If we actually put the node here all concurrent calls to * load_misc_binary() will have finished. We also know * that for the refcount to be zero someone must have concurently * removed the binary type handler from the list and it's our job to * free it. */ put_binfmt_handler(fmt); return retval; } /* Command parsers */ /* * parses and copies one argument enclosed in del from *sp to *dp, * recognising the \x special. * returns pointer to the copied argument or NULL in case of an * error (and sets err) or null argument length. */ static char *scanarg(char *s, char del) { char c; while ((c = *s++) != del) { if (c == '\\' && *s == 'x') { s++; if (!isxdigit(*s++)) return NULL; if (!isxdigit(*s++)) return NULL; } } s[-1] ='\0'; return s; } static char *check_special_flags(char *sfs, Node *e) { char *p = sfs; int cont = 1; /* special flags */ while (cont) { switch (*p) { case 'P': pr_debug("register: flag: P (preserve argv0)\n"); p++; e->flags |= MISC_FMT_PRESERVE_ARGV0; break; case 'O': pr_debug("register: flag: O (open binary)\n"); p++; e->flags |= MISC_FMT_OPEN_BINARY; break; case 'C': pr_debug("register: flag: C (preserve creds)\n"); p++; /* this flags also implies the open-binary flag */ e->flags |= (MISC_FMT_CREDENTIALS | MISC_FMT_OPEN_BINARY); break; case 'F': pr_debug("register: flag: F: open interpreter file now\n"); p++; e->flags |= MISC_FMT_OPEN_FILE; break; default: cont = 0; } } return p; } /* * This registers a new binary format, it recognises the syntax * ':name:type:offset:magic:mask:interpreter:flags' * where the ':' is the IFS, that can be chosen with the first char */ static Node *create_entry(const char __user *buffer, size_t count) { Node *e; int memsize, err; char *buf, *p; char del; pr_debug("register: received %zu bytes\n", count); /* some sanity checks */ err = -EINVAL; if ((count < 11) || (count > MAX_REGISTER_LENGTH)) goto out; err = -ENOMEM; memsize = sizeof(Node) + count + 8; e = kmalloc(memsize, GFP_KERNEL_ACCOUNT); if (!e) goto out; p = buf = (char *)e + sizeof(Node); memset(e, 0, sizeof(Node)); if (copy_from_user(buf, buffer, count)) goto efault; del = *p++; /* delimeter */ pr_debug("register: delim: %#x {%c}\n", del, del); /* Pad the buffer with the delim to simplify parsing below. */ memset(buf + count, del, 8); /* Parse the 'name' field. */ e->name = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->name[0] || !strcmp(e->name, ".") || !strcmp(e->name, "..") || strchr(e->name, '/')) goto einval; pr_debug("register: name: {%s}\n", e->name); /* Parse the 'type' field. */ switch (*p++) { case 'E': pr_debug("register: type: E (extension)\n"); e->flags = 1 << Enabled; break; case 'M': pr_debug("register: type: M (magic)\n"); e->flags = (1 << Enabled) | (1 << Magic); break; default: goto einval; } if (*p++ != del) goto einval; if (test_bit(Magic, &e->flags)) { /* Handle the 'M' (magic) format. */ char *s; /* Parse the 'offset' field. 
*/ s = strchr(p, del); if (!s) goto einval; *s = '\0'; if (p != s) { int r = kstrtoint(p, 10, &e->offset); if (r != 0 || e->offset < 0) goto einval; } p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); /* Parse the 'magic' field. */ e->magic = p; p = scanarg(p, del); if (!p) goto einval; if (!e->magic[0]) goto einval; if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[raw]: ", DUMP_PREFIX_NONE, e->magic, p - e->magic); /* Parse the 'mask' field. */ e->mask = p; p = scanarg(p, del); if (!p) goto einval; if (!e->mask[0]) { e->mask = NULL; pr_debug("register: mask[raw]: none\n"); } else if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[raw]: ", DUMP_PREFIX_NONE, e->mask, p - e->mask); /* * Decode the magic & mask fields. * Note: while we might have accepted embedded NUL bytes from * above, the unescape helpers here will stop at the first one * it encounters. */ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; if (e->size > BINPRM_BUF_SIZE || BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[decoded]: ", DUMP_PREFIX_NONE, e->magic, e->size); if (e->mask) { int i; char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT); print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[decoded]: ", DUMP_PREFIX_NONE, e->mask, e->size); if (masked) { for (i = 0; i < e->size; ++i) masked[i] = e->magic[i] & e->mask[i]; print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[masked]: ", DUMP_PREFIX_NONE, masked, e->size); kfree(masked); } } } } else { /* Handle the 'E' (extension) format. */ /* Skip the 'offset' field. */ p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; /* Parse the 'magic' field. */ e->magic = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->magic[0] || strchr(e->magic, '/')) goto einval; pr_debug("register: extension: {%s}\n", e->magic); /* Skip the 'mask' field. */ p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; } /* Parse the 'interpreter' field. */ e->interpreter = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->interpreter[0]) goto einval; pr_debug("register: interpreter: {%s}\n", e->interpreter); /* Parse the 'flags' field. 
*/ p = check_special_flags(p, e); if (*p == '\n') p++; if (p != buf + count) goto einval; return e; out: return ERR_PTR(err); efault: kfree(e); return ERR_PTR(-EFAULT); einval: kfree(e); return ERR_PTR(-EINVAL); } /* * Set status of entry/binfmt_misc: * '1' enables, '0' disables and '-1' clears entry/binfmt_misc */ static int parse_command(const char __user *buffer, size_t count) { char s[4]; if (count > 3) return -EINVAL; if (copy_from_user(s, buffer, count)) return -EFAULT; if (!count) return 0; if (s[count - 1] == '\n') count--; if (count == 1 && s[0] == '0') return 1; if (count == 1 && s[0] == '1') return 2; if (count == 2 && s[0] == '-' && s[1] == '1') return 3; return -EINVAL; } /* generic stuff */ static void entry_status(Node *e, char *page) { char *dp = page; const char *status = "disabled"; if (test_bit(Enabled, &e->flags)) status = "enabled"; if (!VERBOSE_STATUS) { sprintf(page, "%s\n", status); return; } dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter); /* print the special flags */ dp += sprintf(dp, "flags: "); if (e->flags & MISC_FMT_PRESERVE_ARGV0) *dp++ = 'P'; if (e->flags & MISC_FMT_OPEN_BINARY) *dp++ = 'O'; if (e->flags & MISC_FMT_CREDENTIALS) *dp++ = 'C'; if (e->flags & MISC_FMT_OPEN_FILE) *dp++ = 'F'; *dp++ = '\n'; if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); } else { dp += sprintf(dp, "offset %i\nmagic ", e->offset); dp = bin2hex(dp, e->magic, e->size); if (e->mask) { dp += sprintf(dp, "\nmask "); dp = bin2hex(dp, e->mask, e->size); } *dp++ = '\n'; *dp = '\0'; } } static struct inode *bm_get_inode(struct super_block *sb, int mode) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); } return inode; } /** * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode * @inode: inode of the relevant binfmt_misc instance * * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can * be done without any memory barriers because we are guaranteed that * user_ns->binfmt_misc is fully initialized. It was fully initialized when the * binfmt_misc mount was first created. * * Return: struct binfmt_misc of the relevant binfmt_misc instance */ static struct binfmt_misc *i_binfmt_misc(struct inode *inode) { return inode->i_sb->s_user_ns->binfmt_misc; } /** * bm_evict_inode - cleanup data associated with @inode * @inode: inode to which the data is attached * * Cleanup the binary type handler data associated with @inode if a binary type * entry is removed or the filesystem is unmounted and the super block is * shutdown. * * If the ->evict call was not caused by a super block shutdown but by a write * to remove the entry or all entries via bm_{entry,status}_write() the entry * will have already been removed from the list. We keep the list_empty() check * to make that explicit. */ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; clear_inode(inode); if (e) { struct binfmt_misc *misc; misc = i_binfmt_misc(inode); write_lock(&misc->entries_lock); if (!list_empty(&e->list)) list_del_init(&e->list); write_unlock(&misc->entries_lock); put_binfmt_handler(e); } } /** * remove_binfmt_handler - remove a binary type handler * @misc: handle to binfmt_misc instance * @e: binary type handler to remove * * Remove a binary type handler from the list of binary type handlers and * remove its associated dentry. This is called from * binfmt_{entry,status}_write(). 
In the future, we might want to think about * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's * to use writes to files in order to delete binary type handlers. But it has * worked for so long that it's not a pressing issue. */ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e) { write_lock(&misc->entries_lock); list_del_init(&e->list); write_unlock(&misc->entries_lock); locked_recursive_removal(e->dentry, NULL); } /* /<entry> */ static ssize_t bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { Node *e = file_inode(file)->i_private; ssize_t res; char *page; page = (char *) __get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; entry_status(e, page); res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); free_page((unsigned long) page); return res; } static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); Node *e = inode->i_private; int res = parse_command(buffer, count); switch (res) { case 1: /* Disable this handler. */ clear_bit(Enabled, &e->flags); break; case 2: /* Enable this handler. */ set_bit(Enabled, &e->flags); break; case 3: /* Delete this handler. */ inode = d_inode(inode->i_sb->s_root); inode_lock_nested(inode, I_MUTEX_PARENT); /* * In order to add new element or remove elements from the list * via bm_{entry,register,status}_write() inode_lock() on the * root inode must be held. * The lock is exclusive ensuring that the list can't be * modified. Only load_misc_binary() can access but does so * read-only. So we only need to take the write lock when we * actually remove the entry from the list. */ if (!list_empty(&e->list)) remove_binfmt_handler(i_binfmt_misc(inode), e); inode_unlock(inode); break; default: return res; } return count; } static const struct file_operations bm_entry_operations = { .read = bm_entry_read, .write = bm_entry_write, .llseek = default_llseek, }; /* /register */ /* add to filesystem */ static int add_entry(Node *e, struct super_block *sb) { struct dentry *dentry = simple_start_creating(sb->s_root, e->name); struct inode *inode; struct binfmt_misc *misc; if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = bm_get_inode(sb, S_IFREG | 0644); if (unlikely(!inode)) { simple_done_creating(dentry); return -ENOMEM; } refcount_set(&e->users, 1); e->dentry = dentry; inode->i_private = e; inode->i_fop = &bm_entry_operations; d_make_persistent(dentry, inode); misc = i_binfmt_misc(inode); write_lock(&misc->entries_lock); list_add(&e->list, &misc->entries); write_unlock(&misc->entries_lock); simple_done_creating(dentry); return 0; } static ssize_t bm_register_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { Node *e; struct super_block *sb = file_inode(file)->i_sb; int err = 0; struct file *f = NULL; e = create_entry(buffer, count); if (IS_ERR(e)) return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { /* * Now that we support unprivileged binfmt_misc mounts make * sure we use the credentials that the register @file was * opened with to also open the interpreter. Before that this * didn't matter much as only a privileged process could open * the register file. 
*/ scoped_with_creds(file->f_cred) f = open_exec(e->interpreter); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); kfree(e); return PTR_ERR(f); } e->interp_file = f; } err = add_entry(e, sb); if (err) { if (f) { exe_file_allow_write_access(f); filp_close(f, NULL); } kfree(e); return err; } return count; } static const struct file_operations bm_register_operations = { .write = bm_register_write, .llseek = noop_llseek, }; /* /status */ static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct binfmt_misc *misc; char *s; misc = i_binfmt_misc(file_inode(file)); s = misc->enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct binfmt_misc *misc; int res = parse_command(buffer, count); Node *e, *next; struct inode *inode; misc = i_binfmt_misc(file_inode(file)); switch (res) { case 1: /* Disable all handlers. */ misc->enabled = false; break; case 2: /* Enable all handlers. */ misc->enabled = true; break; case 3: /* Delete all handlers. */ inode = d_inode(file_inode(file)->i_sb->s_root); inode_lock_nested(inode, I_MUTEX_PARENT); /* * In order to add new element or remove elements from the list * via bm_{entry,register,status}_write() inode_lock() on the * root inode must be held. * The lock is exclusive ensuring that the list can't be * modified. Only load_misc_binary() can access but does so * read-only. So we only need to take the write lock when we * actually remove the entry from the list. */ list_for_each_entry_safe(e, next, &misc->entries, list) remove_binfmt_handler(misc, e); inode_unlock(inode); break; default: return res; } return count; } static const struct file_operations bm_status_operations = { .read = bm_status_read, .write = bm_status_write, .llseek = default_llseek, }; /* Superblock handling */ static void bm_put_super(struct super_block *sb) { struct user_namespace *user_ns = sb->s_fs_info; sb->s_fs_info = NULL; put_user_ns(user_ns); } static const struct super_operations s_ops = { .statfs = simple_statfs, .evict_inode = bm_evict_inode, .put_super = bm_put_super, }; static int bm_fill_super(struct super_block *sb, struct fs_context *fc) { int err; struct user_namespace *user_ns = sb->s_user_ns; struct binfmt_misc *misc; static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; if (WARN_ON(user_ns != current_user_ns())) return -EINVAL; /* * Lazily allocate a new binfmt_misc instance for this namespace, i.e. * do it here during the first mount of binfmt_misc. We don't need to * waste memory for every user namespace allocation. It's likely much * more common to not mount a separate binfmt_misc instance than it is * to mount one. * * While multiple superblocks can exist they are keyed by userns in * s_fs_info for binfmt_misc. Hence, the vfs guarantees that * bm_fill_super() is called exactly once whenever a binfmt_misc * superblock for a userns is created. This in turn lets us conclude * that when a binfmt_misc superblock is created for the first time for * a userns there's no one racing us. Therefore we don't need any * barriers when we dereference binfmt_misc. 
*/ misc = user_ns->binfmt_misc; if (!misc) { /* * If it turns out that most user namespaces actually want to * register their own binary type handler and therefore all * create their own separate binfmt_misc mounts we should * consider turning this into a kmem cache. */ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); if (!misc) return -ENOMEM; INIT_LIST_HEAD(&misc->entries); rwlock_init(&misc->entries_lock); /* Pairs with smp_load_acquire() in load_binfmt_misc(). */ smp_store_release(&user_ns->binfmt_misc, misc); } /* * When the binfmt_misc superblock for this userns is shutdown * ->enabled might have been set to false and we don't reinitialize * ->enabled again in put_super() as someone might already be mounting * binfmt_misc again. It also would be pointless since by the time * ->put_super() is called we know that the binary type list for this * bintfmt_misc mount is empty making load_misc_binary() return * -ENOEXEC independent of whether ->enabled is true. Instead, if * someone mounts binfmt_misc for the first time or again we simply * reset ->enabled to true. */ misc->enabled = true; err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; } static void bm_free(struct fs_context *fc) { if (fc->s_fs_info) put_user_ns(fc->s_fs_info); } static int bm_get_tree(struct fs_context *fc) { return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns)); } static const struct fs_context_operations bm_context_ops = { .free = bm_free, .get_tree = bm_get_tree, }; static int bm_init_fs_context(struct fs_context *fc) { fc->ops = &bm_context_ops; return 0; } static struct linux_binfmt misc_format = { .module = THIS_MODULE, .load_binary = load_misc_binary, }; static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", .init_fs_context = bm_init_fs_context, .fs_flags = FS_USERNS_MOUNT, .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("binfmt_misc"); static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); return err; } static void __exit exit_misc_binfmt(void) { unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } core_initcall(init_misc_binfmt); module_exit(exit_misc_binfmt); MODULE_DESCRIPTION("Kernel support for miscellaneous binaries"); MODULE_LICENSE("GPL"); |
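A hedged userspace sketch of the registration syntax parsed by create_entry() above (':name:type:offset:magic:mask:interpreter:flags'): it writes one entry line to the 'register' file of a binfmt_misc mount. The mount point, entry name, "MZ" magic and interpreter path are illustrative assumptions, not values defined by the code; with the 'F' flag the interpreter must already exist when the entry is registered, since bm_register_write() opens it immediately.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/*
	 * ':' is the chosen delimiter; type 'M' matches the two-byte magic
	 * "MZ" at offset 0; the empty mask field means "compare every magic
	 * byte"; flag 'F' asks for the interpreter to be opened now.
	 */
	const char *entry = ":windows:M:0:MZ::/usr/bin/wine:F\n";
	int fd = open("/proc/sys/fs/binfmt_misc/register", O_WRONLY);

	if (fd < 0) {
		perror("open register");
		return 1;
	}
	if (write(fd, entry, strlen(entry)) != (ssize_t)strlen(entry))
		perror("write register");
	close(fd);
	return 0;
}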
| 168 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | /* SPDX-License-Identifier: GPL-2.0 */ /* * kernel/workqueue_internal.h * * Workqueue internal header file. Only to be included by workqueue and * core kernel subsystems. */ #ifndef _KERNEL_WORKQUEUE_INTERNAL_H #define _KERNEL_WORKQUEUE_INTERNAL_H #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/preempt.h> struct worker_pool; /* * The poor guys doing the actual heavy lifting. All on-duty workers are * either serving the manager role, on idle list or on busy hash. For * details on the locking annotation (L, I, X...), refer to workqueue.c. * * Only to be used in workqueue and async. */ struct worker { /* on idle list while idle, on busy hash table while busy */ union { struct list_head entry; /* L: while idle */ struct hlist_node hentry; /* L: while busy */ }; struct work_struct *current_work; /* K: work being processed and its */ work_func_t current_func; /* K: function */ struct pool_workqueue *current_pwq; /* K: pwq */ u64 current_at; /* K: runtime at start or last wakeup */ unsigned int current_color; /* K: color */ int sleeping; /* S: is worker sleeping? */ /* used by the scheduler to determine a worker's last known identity */ work_func_t last_func; /* K: last work's fn */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* A: the associated pool */ /* L: for rescuers */ struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ unsigned long last_active; /* K: last active timestamp */ unsigned int flags; /* L: flags */ int id; /* I: worker id */ /* * Opaque string set with work_set_desc(). Printed out with task * dump for debugging - WARN, BUG, panic or sysrq. */ char desc[WORKER_DESC_LEN]; /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; /** * current_wq_worker - return struct worker if %current is a workqueue worker */ static inline struct worker *current_wq_worker(void) { if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } /* * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/ and workqueue.c. */ void wq_worker_running(struct task_struct *task); void wq_worker_sleeping(struct task_struct *task); void wq_worker_tick(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |
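The wq_worker_sleeping()/wq_worker_running() declarations above are the hooks the scheduler uses for concurrency-managed workqueues. A minimal sketch of the calling pattern, with illustrative function names only (the real call sites live in kernel/sched/core.c and are not reproduced here):

/*
 * Illustrative only: notify the workqueue code around blocking so the
 * worker pool can wake a replacement worker while this one sleeps, and
 * undo that accounting when the worker runs again.
 */
static void example_sched_submit_work(struct task_struct *tsk)
{
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);
}

static void example_sched_update_worker(struct task_struct *tsk)
{
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_running(tsk);
}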
| 5 1 1 7 3 1 6 3 6 1 7 7 32 5 12 15 4 2 2 13 2 12 20 14 7 1 2 13 1 1 5 1 5 1 5 9 68 33 38 27 41 38 7 16 4 2 11 17 47 74 25 64 74 74 137 133 4 137 1 1 14 10 4 1 1 60 57 2 1 25 19 14 2 42 4 42 4 27 15 23 2 17 36 4 9 31 26 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/fat/file.c * * Written 1992,1993 by Werner Almesberger * * regular file handling primitives for fat-based filesystems */ #include <linux/capability.h> #include <linux/module.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/falloc.h> #include "fat.h" static long fat_fallocate(struct file *file, int mode, loff_t offset, loff_t len); static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; inode_lock_shared(inode); attr = fat_make_attrs(inode); inode_unlock_shared(inode); return put_user(attr, user_attr); } static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) { struct inode *inode = file_inode(file); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int is_dir = S_ISDIR(inode->i_mode); u32 attr, oldattr; struct iattr ia; int err; err = get_user(attr, user_attr); if (err) goto out; err = mnt_want_write_file(file); if (err) goto out; inode_lock(inode); /* * 
ATTR_VOLUME and ATTR_DIR cannot be changed; this also * prevents the user from turning us into a VFAT * longname entry. Also, we obviously can't set * any of the NTFS attributes in the high 24 bits. */ attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); /* Merge in ATTR_VOLUME and ATTR_DIR */ attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | (is_dir ? ATTR_DIR : 0); oldattr = fat_make_attrs(inode); /* Equivalent to a chmod() */ ia.ia_valid = ATTR_MODE | ATTR_CTIME; ia.ia_ctime = current_time(inode); if (is_dir) ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); else { ia.ia_mode = fat_make_mode(sbi, attr, S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); } /* The root directory has no attributes */ if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { err = -EINVAL; goto out_unlock_inode; } if (sbi->options.sys_immutable && ((attr | oldattr) & ATTR_SYS) && !capable(CAP_LINUX_IMMUTABLE)) { err = -EPERM; goto out_unlock_inode; } /* * The security check is questionable... We single * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ err = security_inode_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ err = fat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; fsnotify_change(file->f_path.dentry, ia.ia_valid); if (sbi->options.sys_immutable) { if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; } fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: inode_unlock(inode); mnt_drop_write_file(file); out: return err; } static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); return put_user(sbi->vol_id, user_attr); } static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) { struct super_block *sb = inode->i_sb; struct fstrim_range __user *user_range; struct fstrim_range range; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, bdev_discard_granularity(sb->s_bdev)); err = fat_trim_fs(inode, &range); if (err < 0) return err; if (copy_to_user(user_range, &range, sizeof(range))) return -EFAULT; return 0; } long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { case FAT_IOCTL_GET_ATTRIBUTES: return fat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return fat_ioctl_set_attributes(filp, user_attr); case FAT_IOCTL_GET_VOLUME_ID: return fat_ioctl_get_volume_id(inode, user_attr); case FITRIM: return fat_ioctl_fitrim(inode, arg); default: return -ENOTTY; /* Inappropriate ioctl for device */ } } static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(HZ/10); } return 0; } int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; int err; err = __generic_file_fsync(filp, start, end, datasync); if (err) return err; err = 
sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap_prepare = generic_file_mmap_prepare, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, .compat_ioctl = compat_ptr_ioctl, .fsync = fat_file_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; loff_t start = inode->i_size, count = size - inode->i_size; int err; err = generic_cont_expand_simple(inode, size); if (err) goto out; fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); mark_inode_dirty(inode); if (IS_SYNC(inode)) { int err2; /* * Opencode syncing since we don't have a file open to use * standard fsync path. */ err = filemap_fdatawrite_range(mapping, start, start + count - 1); err2 = sync_mapping_buffers(mapping); if (!err) err = err2; err2 = write_inode_now(inode, 1); if (!err) err = err2; if (!err) { err = filemap_fdatawait_range(mapping, start, start + count - 1); } } out: return err; } /* * Preallocate space for a file. This implements fat's fallocate file * operation, which gets called from sys_fallocate system call. User * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set * we just allocate clusters without zeroing them out. Otherwise we * allocate and zero out clusters via an expanding truncate. */ static long fat_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { int nr_cluster; /* Number of clusters to be allocated */ loff_t mm_bytes; /* Number of bytes to be allocated for file */ loff_t ondisksize; /* block aligned on-disk size in bytes*/ struct inode *inode = file->f_mapping->host; struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); int err = 0; /* No support for hole punch or other fallocate flags. */ if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; /* No support for dir */ if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; inode_lock(inode); if (mode & FALLOC_FL_KEEP_SIZE) { ondisksize = inode->i_blocks << 9; if ((offset + len) <= ondisksize) goto error; /* First compute the number of clusters to be allocated */ mm_bytes = offset + len - ondisksize; nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> sbi->cluster_bits; /* Start the allocation.We are not zeroing out the clusters */ while (nr_cluster-- > 0) { err = fat_add_cluster(inode); if (err) goto error; } } else { if ((offset + len) <= i_size_read(inode)) goto error; /* This is just an expanding truncate */ err = fat_cont_expand(inode, (offset + len)); } error: inode_unlock(inode); return err; } /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { struct super_block *sb = inode->i_sb; int err, wait, free_start, i_start, i_logstart; if (MSDOS_I(inode)->i_start == 0) return 0; fat_cache_inval_inode(inode); wait = IS_DIRSYNC(inode); i_start = free_start = MSDOS_I(inode)->i_start; i_logstart = MSDOS_I(inode)->i_logstart; /* First, we write the new file size. 
*/ if (!skip) { MSDOS_I(inode)->i_start = 0; MSDOS_I(inode)->i_logstart = 0; } MSDOS_I(inode)->i_attrs |= ATTR_ARCH; fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); if (wait) { err = fat_sync_inode(inode); if (err) { MSDOS_I(inode)->i_start = i_start; MSDOS_I(inode)->i_logstart = i_logstart; return err; } } else mark_inode_dirty(inode); /* Write a new EOF, and get the remaining cluster chain for freeing. */ if (skip) { struct fat_entry fatent; int ret, fclus, dclus; ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus); if (ret < 0) return ret; else if (ret == FAT_ENT_EOF) return 0; fatent_init(&fatent); ret = fat_ent_read(inode, &fatent, dclus); if (ret == FAT_ENT_EOF) { fatent_brelse(&fatent); return 0; } else if (ret == FAT_ENT_FREE) { fat_fs_error(sb, "%s: invalid cluster chain (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); ret = -EIO; } else if (ret > 0) { err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); if (err) ret = err; } fatent_brelse(&fatent); if (ret < 0) return ret; free_start = ret; } inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); /* Freeing the remained cluster chain */ return fat_free_clusters(inode, free_start); } void fat_truncate_blocks(struct inode *inode, loff_t offset) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); const unsigned int cluster_size = sbi->cluster_size; int nr_clusters; /* * This protects against truncating a file bigger than it was then * trying to write into the hole. */ if (MSDOS_I(inode)->mmu_private > offset) MSDOS_I(inode)->mmu_private = offset; nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; fat_free(inode, nr_clusters); fat_flush_inodes(inode->i_sb, inode, NULL); } int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); generic_fillattr(idmap, request_mask, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { /* Use i_pos for ino. This is used as fileid of nfs. */ stat->ino = fat_i_pos_read(sbi, inode); } if (sbi->options.isvfat && request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = MSDOS_I(inode)->i_crtime; } return 0; } EXPORT_SYMBOL_GPL(fat_getattr); static int fat_sanitize_mode(const struct msdos_sb_info *sbi, struct inode *inode, umode_t *mode_ptr) { umode_t mask, perm; /* * Note, the basic check is already done by a caller of * (attr->ia_mode & ~FAT_VALID_MODE) */ if (S_ISREG(inode->i_mode)) mask = sbi->options.fs_fmask; else mask = sbi->options.fs_dmask; perm = *mode_ptr & ~(S_IFMT | mask); /* * Of the r and x bits, all (subject to umask) must be present. Of the * w bits, either all (subject to umask) or none must be present. * * If fat_mode_can_hold_ro(inode) is false, can't change w bits. 
*/ if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) return -EPERM; if (fat_mode_can_hold_ro(inode)) { if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) return -EPERM; } else { if ((perm & S_IWUGO) != (S_IWUGO & ~mask)) return -EPERM; } *mode_ptr &= S_IFMT | perm; return 0; } static int fat_allow_set_time(struct mnt_idmap *idmap, struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; } /* use a default check */ return 0; } #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; /* Check for setting the inode time. */ ia_valid = attr->ia_valid; if (ia_valid & TIMES_SET_FLAGS) { if (fat_allow_set_time(idmap, sbi, inode)) attr->ia_valid &= ~TIMES_SET_FLAGS; } error = setattr_prepare(idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) error = 0; goto out; } /* * Expand the file. Since inode_setattr() updates ->i_size * before calling the ->truncate(), but FAT needs to fill the * hole before it. XXX: this is no longer true with new truncate * sequence. */ if (attr->ia_valid & ATTR_SIZE) { inode_dio_wait(inode); if (attr->ia_size > inode->i_size) { error = fat_cont_expand(inode, attr->ia_size); if (error || attr->ia_valid == ATTR_SIZE) goto out; attr->ia_valid &= ~ATTR_SIZE; } } if (((attr->ia_valid & ATTR_UID) && (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid), sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid), sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; if (error) { if (sbi->options.quiet) error = 0; goto out; } /* * We don't return -EPERM here. Yes, strange, but this is too * old behavior. */ if (attr->ia_valid & ATTR_MODE) { if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) attr->ia_valid &= ~ATTR_MODE; } if (attr->ia_valid & ATTR_SIZE) { error = fat_block_truncate_page(inode, attr->ia_size); if (error) goto out; down_write(&MSDOS_I(inode)->truncate_lock); truncate_setsize(inode, attr->ia_size); fat_truncate_blocks(inode, attr->ia_size); up_write(&MSDOS_I(inode)->truncate_lock); } /* * setattr_copy can't truncate these appropriately, so we'll * copy them ourselves */ if (attr->ia_valid & ATTR_ATIME) fat_truncate_time(inode, &attr->ia_atime, S_ATIME); if (attr->ia_valid & ATTR_CTIME) fat_truncate_time(inode, &attr->ia_ctime, S_CTIME); if (attr->ia_valid & ATTR_MTIME) fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); out: return error; } EXPORT_SYMBOL_GPL(fat_setattr); const struct inode_operations fat_file_inode_operations = { .setattr = fat_setattr, .getattr = fat_getattr, .update_time = fat_update_time, }; |
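A hedged userspace sketch of how fat_fallocate() above is reached through fallocate(2): with FALLOC_FL_KEEP_SIZE only clusters are allocated and i_size is left alone, otherwise the file grows via an expanding truncate. The file path and size are example values, not anything defined by the code.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/fatfs/prealloc.bin", O_CREAT | O_WRONLY, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Reserve 1 MiB of clusters without changing the file size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}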
| 110 110 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/sgi.c * * Code extracted from drivers/block/genhd.c */ #include "check.h" #define SGI_LABEL_MAGIC 0x0be5a941 enum { LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ }; struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ __be16 root_part_num; /* Root partition number */ __be16 swap_part_num; /* Swap partition number */ s8 boot_file[16]; /* Name of boot file for ARCS */ u8 _unused0[48]; /* Device parameter useless crapola.. */ struct sgi_volume { s8 name[8]; /* Name of volume */ __be32 block_num; /* Logical block number */ __be32 num_bytes; /* How big, in bytes */ } volume[15]; struct sgi_partition { __be32 num_blocks; /* Size in logical blocks */ __be32 first_block; /* First logical block */ __be32 type; /* Type of this partition */ } partitions[16]; __be32 csum; /* Disk label checksum */ __be32 _unused1; /* Padding */ }; int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; int slot = 1; unsigned int start, blocks; __be32 *ui, cs; Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; label = read_part_sector(state, 0, §); if (!label) return -1; p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { put_dev_sector(sect); return 0; } ui = ((__be32 *) (label + 1)) - 1; for(csum = 0; ui >= ((__be32 *) label);) { cs = *ui--; csum += be32_to_cpu(cs); } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", state->disk->disk_name); put_dev_sector(sect); return 0; } /* All SGI disk labels have 16 partitions, disks under Linux only * have 15 minor's. Luckily there are always a few zero length * partitions which we don't care about so we never overflow the * current_minor. */ for(i = 0; i < 16; i++, p++) { blocks = be32_to_cpu(p->num_blocks); start = be32_to_cpu(p->first_block); if (blocks) { put_partition(state, slot, start, blocks); if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; } slot++; } strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } |
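The checksum loop in sgi_partition() above relies on the whole on-disk label summing to zero when read as big-endian 32-bit words. A minimal restatement of that invariant (helper name is illustrative):

/*
 * A label is intact when the 32-bit big-endian words covering it sum to
 * zero, i.e. the csum field holds the negated sum of the rest of the label.
 */
static u32 example_sgi_label_sum(const struct sgi_disklabel *label)
{
	const __be32 *p = (const __be32 *)(label + 1);
	u32 sum = 0;

	while (--p >= (const __be32 *)label)
		sum += be32_to_cpu(*p);
	return sum;	/* 0 for a valid label */
}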
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLK_INTEGRITY_H #define _LINUX_BLK_INTEGRITY_H #include <linux/blk-mq.h> #include <linux/bio-integrity.h> #include <linux/blk-mq-dma.h> struct request; /* * Maximum contiguous integrity buffer allocation. */ #define BLK_INTEGRITY_MAX_SIZE SZ_2M enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); bool queue_limits_stack_integrity(struct queue_limits *t, struct queue_limits *b); static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, struct block_device *bdev) { return queue_limits_stack_integrity(t, &bdev->bd_disk->queue->limits); } #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); bool blk_rq_integrity_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter); bool blk_rq_integrity_dma_map_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { return q->limits.integrity.metadata_size; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; return &disk->queue->limits.integrity; } static inline struct blk_integrity * bdev_get_integrity(struct block_device *bdev) { return blk_get_integrity(bdev->bd_disk); } static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return q->limits.max_integrity_segments; } /** * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi: blk_integrity profile for device * @sectors: Size of the bio in 512-byte sectors * * Description: The block layer calculates everything in 512 byte * sectors but integrity metadata is done in terms of the data integrity * interval size of the storage device. Convert the block layer sectors * to the appropriate number of integrity intervals. 
*/ static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { return sectors >> (bi->interval_exp - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { return bio_integrity_intervals(bi, sectors) * bi->metadata_size; } static inline bool blk_integrity_rq(struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; } /* * Return the current bvec that contains the integrity data. bip_iter may be * advanced to iterate over the integrity data. */ static inline struct bio_vec rq_integrity_vec(struct request *rq) { return mp_bvec_iter_bvec(rq->bio->bi_integrity->bip_vec, rq->bio->bi_integrity->bip_iter); } #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { return -ENOIOCTLCMD; } static inline int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *b) { return 0; } static inline int blk_rq_map_integrity_sg(struct request *q, struct scatterlist *s) { return 0; } static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { return -EINVAL; } static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { return false; } static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter) { return false; } static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { return NULL; } static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return 0; } static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { return 0; } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { return 0; } static inline int blk_integrity_rq(struct request *rq) { return 0; } static inline struct bio_vec rq_integrity_vec(struct request *rq) { /* the optimizer will remove all calls to this function */ return (struct bio_vec){ }; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* _LINUX_BLK_INTEGRITY_H */ |
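A hedged worked example of the sector-to-interval conversion documented for bio_integrity_intervals() above, using assumed profile values (4096-byte protection interval, 8 bytes of metadata per interval) rather than any specific device:

static inline unsigned int example_integrity_bytes(void)
{
	/* interval_exp = 12 means 4096-byte intervals; 32 sectors = 16 KiB. */
	struct blk_integrity bi = { .interval_exp = 12, .metadata_size = 8 };

	/* 32 >> (12 - 9) = 4 intervals, 4 * 8 = 32 bytes of metadata. */
	return bio_integrity_bytes(&bi, 32);
}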
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation * * utilities for mac80211 */ #include <net/mac80211.h> #include <linux/netdevice.h> #include <linux/export.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/bitmap.h> #include <linux/crc32.h> #include <net/net_namespace.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <kunit/visibility.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" #include "led.h" #include "wep.h" /* privid for wiphys to determine whether they belong to us or not */ const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid; struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { struct ieee80211_local *local; local = wiphy_priv(wiphy); return &local->hw; } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited = { .mode = IEEE80211_CONN_MODE_EHT, .bw_limit = IEEE80211_CONN_BW_LIMIT_320, }; u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) { __le16 fc = hdr->frame_control; if (ieee80211_is_data(fc)) { if (len < 24) /* drop incorrect hdr len (data) */ return NULL; if (ieee80211_has_a4(fc)) return NULL; if (ieee80211_has_tods(fc)) return hdr->addr1; if (ieee80211_has_fromds(fc)) return hdr->addr2; return hdr->addr3; } if (ieee80211_is_s1g_beacon(fc)) { struct ieee80211_ext *ext = (void *) hdr; return ext->u.s1g_beacon.sa; } if (ieee80211_is_mgmt(fc)) { if (len < 24) /* drop incorrect hdr len (mgmt) */ return NULL; return hdr->addr3; } if (ieee80211_is_ctl(fc)) { if (ieee80211_is_pspoll(fc)) return hdr->addr1; if (ieee80211_is_back_req(fc)) { switch (type) { case NL80211_IFTYPE_STATION: return hdr->addr2; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: return hdr->addr1; default: break; /* fall through to the return */ } } } return NULL; } EXPORT_SYMBOL(ieee80211_get_bssid); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; skb_queue_walk(&tx->skbs, skb) { hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } } int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble) { int dur; /* calculate duration (in microseconds, rounded up to next higher * integer if it includes a fractional microsecond) to send frame of * len bytes (does not include FCS) at the given rate. Duration will * also include SIFS. * * rate is in 100 kbps, so dividend is multiplied by 10 in the * DIV_ROUND_UP() operations.
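	 *
	 * For example (illustrative values, not taken from a standards
	 * table): an ERP/OFDM frame with len = 100 sent at 54 Mbps
	 * (rate = 540) gives, in the OFDM branch below,
	 * dur = 16 + 16 + 4 + 4 * DIV_ROUND_UP((16 + 8 * 104 + 6) * 10, 4 * 540)
	 *     = 36 + 4 * DIV_ROUND_UP(8540, 2160) = 36 + 16 = 52 usec.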
*/ if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * * N_DBPS = DATARATE x 4 * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS) * (16 = SIGNAL time, 6 = tail bits) * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext * * T_SYM = 4 usec * 802.11a - 18.5.2: aSIFSTime = 16 usec * 802.11g - 19.8.4: aSIFSTime = 10 usec + * signal ext = 6 usec */ dur = 16; /* SIFS + signal ext */ dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */ dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */ /* rates should already consider the channel bandwidth, * don't apply divisor again. */ dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10, 4 * rate); /* T_SYM x N_SYM */ } else { /* * 802.11b or 802.11g with 802.11b compatibility: * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime + * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0. * * 802.11 (DS): 15.3.3, 802.11b: 18.3.4 * aSIFSTime = 10 usec * aPreambleLength = 144 usec or 72 usec with short preamble * aPLCPHeaderLength = 48 usec or 24 usec with short preamble */ dur = 10; /* aSIFSTime = 10 usec */ dur += short_preamble ? (72 + 24) : (144 + 48); dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate); } return dur; } /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { struct ieee80211_sub_if_data *sdata; u16 dur; int erp; bool short_preamble = false; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_generic_frame_duration); __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* CTS duration */ dur = ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); /* Data frame duration */ dur += ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_rts_duration); __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags 
& IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* Data frame duration */ dur = ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) { /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); } return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_ctstoself_duration); static void wake_tx_push_queue(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_txq *queue) { struct ieee80211_tx_control control = { .sta = queue->sta, }; struct sk_buff *skb; while (1) { skb = ieee80211_tx_dequeue(&local->hw, queue); if (!skb) break; drv_tx(local, &control, skb); } } /* wake_tx_queue handler for driver not implementing a custom one*/ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct ieee80211_txq *queue; spin_lock(&local->handle_wake_tx_queue_lock); /* Use ieee80211_next_txq() for airtime fairness accounting */ ieee80211_txq_schedule_start(hw, txq->ac); while ((queue = ieee80211_next_txq(hw, txq->ac))) { wake_tx_push_queue(local, sdata, queue); ieee80211_return_txq(hw, queue, false); } ieee80211_txq_schedule_end(hw, txq->ac); spin_unlock(&local->handle_wake_tx_queue_lock); } EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif *vif = &sdata->vif; struct fq *fq = &local->fq; struct ps_data *ps = NULL; struct txq_info *txqi; struct sta_info *sta; int i; local_bh_disable(); spin_lock(&fq->lock); if (!test_bit(SDATA_STATE_RUNNING, &sdata->state)) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct ieee80211_txq *txq = sta->sta.txq[i]; if (!txq) continue; txqi = to_txq_info(txq); if (ac != txq->ac) continue; if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags)) continue; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); spin_lock(&fq->lock); } } if (!vif->txq) goto out; txqi = to_txq_info(vif->txq); if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) || (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); local_bh_enable(); return; out: spin_unlock(&fq->lock); local_bh_enable(); } static void __releases(&local->queue_stop_reason_lock) __acquires(&local->queue_stop_reason_lock) _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) { struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; int i; rcu_read_lock(); if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; for (i = 0; i < local->hw.queues; i++) { if (local->queue_stop_reasons[i]) continue; spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (ac_queue == i || sdata->vif.cab_queue == i) __ieee80211_wake_txqs(sdata, ac); } } spin_lock_irqsave(&local->queue_stop_reason_lock, *flags); } rcu_read_unlock(); } void ieee80211_wake_txqs(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, wake_txqs_tasklet); unsigned long flags; 
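	/*
	 * Tasklet context: take the queue-stop lock and let
	 * _ieee80211_wake_txqs() walk the queues that no longer have any
	 * stop reason set; that helper temporarily drops and re-takes the
	 * lock around the per-interface wake callbacks (see
	 * __ieee80211_wake_txqs() above).
	 */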
spin_lock_irqsave(&local->queue_stop_reason_lock, flags); _ieee80211_wake_txqs(local, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, unsigned long *flags) { struct ieee80211_local *local = hw_to_local(hw); if (WARN_ON(queue >= hw->queues)) return; if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; if (!refcounted) { local->q_stop_reasons[queue][reason] = 0; } else { local->q_stop_reasons[queue][reason]--; if (WARN_ON(local->q_stop_reasons[queue][reason] < 0)) local->q_stop_reasons[queue][reason] = 0; } if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); trace_wake_queue(local, queue, reason, local->q_stop_reasons[queue][reason]); if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ return; if (!skb_queue_empty(&local->pending[queue])) tasklet_schedule(&local->tx_pending_tasklet); /* * Calling _ieee80211_wake_txqs here can be a problem because it may * release queue_stop_reason_lock which has been taken by * __ieee80211_wake_queue's caller. It is certainly not very nice to * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) tasklet_schedule(&local->wake_txqs_tasklet); else _ieee80211_wake_txqs(local, flags); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) { ieee80211_wake_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queue); static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); if (WARN_ON(queue >= hw->queues)) return; if (!refcounted) local->q_stop_reasons[queue][reason] = 1; else local->q_stop_reasons[queue][reason]++; trace_stop_queue(local, queue, reason, local->q_stop_reasons[queue][reason]); set_bit(reason, &local->queue_stop_reasons[queue]); } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) { ieee80211_stop_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_hw *hw = &local->hw; unsigned long flags; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int queue = info->hw_queue; if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); return; } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); 
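	/*
	 * The queue is now stopped under the SKB_ADD reason so the frame
	 * can be appended to the pending list; the wake below drops that
	 * reason again and, if nothing else keeps the queue stopped,
	 * schedules the pending tasklet to flush it.
	 */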
__skb_queue_tail(&local->pending[queue], skb); __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs) { struct ieee80211_hw *hw = &local->hw; struct sk_buff *skb; unsigned long flags; int queue, i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); while ((skb = skb_dequeue(skbs))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } queue = info->hw_queue; __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); } for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_stop_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues(struct ieee80211_hw *hw) { ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int ret; if (WARN_ON(queue >= hw->queues)) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER, &local->queue_stop_reasons[queue]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return ret; } EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_wake_queue(hw, i, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queues(struct ieee80211_hw *hw) { ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queues); unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { unsigned int queues; if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->vif.hw_queue[ac] != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.hw_queue[ac]); if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.cab_queue); } else { /* all queues */ queues = BIT(local->hw.queues) - 1; } return queues; } void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { if (!local->ops->flush && !drop) return; /* * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = 
ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); if (drop) { struct sta_info *sta; /* Purge the queues, so the frames on them won't be * sent during __ieee80211_wake_queue() */ list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; ieee80211_purge_sta_txqs(sta); } } if (local->ops->flush) drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); } void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop) { __ieee80211_flush_queues(local, sdata, 0, drop); } static void __iterate_interfaces(struct ieee80211_local *local, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_sub_if_data *sdata; bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; list_for_each_entry_rcu(sdata, &local->interfaces, list, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: continue; default: break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if (ieee80211_sdata_running(sdata) || !active_only) iterator(data, sdata->vif.addr, &sdata->vif); } sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); } void ieee80211_iterate_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); mutex_lock(&local->iflist_mtx); __iterate_interfaces(local, iter_flags, iterator, data); mutex_unlock(&local->iflist_mtx); } EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces); void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); void ieee80211_iterate_active_interfaces_mtx( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct sta_info *sta; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (!sta->uploaded) continue; iterator(data, &sta->sta); } } void 
ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_stations(local, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(local->hw.wiphy); __iterate_stations(local, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return NULL; return &sdata->vif; } EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { if (!vif) return NULL; return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. Since we can't check each caller * of this function if we are already quiescing / suspended, * check here and don't WARN since this can actually happen when * the rx path (for example) is racing against __ieee80211_suspend * and suspending / quiescing was set after the rx path checked * them. */ static bool ieee80211_can_queue_work(struct ieee80211_local *local) { if (local->quiescing || (local->suspended && !local->resuming)) { pr_warn("queueing ieee80211 work while going to suspend\n"); return false; } return true; } void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_work(local->workqueue, work); } EXPORT_SYMBOL(ieee80211_queue_work); void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_delayed_work(local->workqueue, dwork, delay); } EXPORT_SYMBOL(ieee80211_queue_delayed_work); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac) { struct ieee80211_chanctx_conf *chanctx_conf; const struct ieee80211_reg_rule *rrule; const struct ieee80211_wmm_ac *wmm_ac; u16 center_freq = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_STATION) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) center_freq = chanctx_conf->def.chan->center_freq; if (!center_freq) { rcu_read_unlock(); return; } rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) { rcu_read_unlock(); return; } if (sdata->vif.type == NL80211_IFTYPE_AP) wmm_ac = &rrule->wmm_rule.ap[ac]; else wmm_ac = &rrule->wmm_rule.client[ac]; qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32); rcu_read_unlock(); } void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = 
sdata->local; struct ieee80211_tx_queue_params qparam; struct ieee80211_chanctx_conf *chanctx_conf; int ac; bool use_11b; bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */ int aCWmin, aCWmax; if (!local->ops->conf_tx) return; if (local->hw.queues < IEEE80211_NUM_ACS) return; memset(&qparam, 0, sizeof(qparam)); rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); use_11b = (chanctx_conf && chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !link->operating_11g_mode; rcu_read_unlock(); is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB); /* Set defaults according to 802.11-2007 Table 7-37 */ aCWmax = 1023; if (use_11b) aCWmin = 31; else aCWmin = 15; /* Configure old 802.11b/g medium access rules. */ qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; qparam.aifs = 2; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { /* Update if QoS is enabled. */ if (enable_qos) { switch (ac) { case IEEE80211_AC_BK: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 9; else qparam.aifs = 7; break; /* never happens but let's not leave undefined */ default: case IEEE80211_AC_BE: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 6; else qparam.aifs = 3; break; case IEEE80211_AC_VI: qparam.cw_max = aCWmin; qparam.cw_min = (aCWmin + 1) / 2 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 6016/32; else qparam.txop = 3008/32; if (is_ocb) qparam.aifs = 3; else qparam.aifs = 2; break; case IEEE80211_AC_VO: qparam.cw_max = (aCWmin + 1) / 2 - 1; qparam.cw_min = (aCWmin + 1) / 4 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 3264/32; else qparam.txop = 1504/32; qparam.aifs = 2; break; } } ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; link->tx_conf[ac] = qparam; drv_conf_tx(local, link, ac, &qparam); } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) { link->conf->qos = enable_qos; if (bss_notify) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool multi_link = ieee80211_vif_is_mld(&sdata->vif); struct { u8 id; u8 len; u8 ext_id; struct ieee80211_multi_link_elem ml; struct ieee80211_mle_basic_common_info basic; } __packed mle = { .id = WLAN_EID_EXTENSION, .len = sizeof(mle) - 2, .ext_id = WLAN_EID_EXT_EHT_MULTI_LINK, .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC), .basic.len = sizeof(mle.basic), }; int err; memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN + multi_link * sizeof(mle)); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = skb_put_zero(skb, 24 + 6); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH); memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); mgmt->u.auth.auth_transaction = 
cpu_to_le16(transaction); mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); if (multi_link) skb_put_data(skb, &mle, sizeof(mle)); if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); if (WARN_ON(err)) { kfree_skb(skb); return; } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | tx_flags; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt = (void *)frame_buf; /* build frame */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); mgmt->duration = 0; /* initialize only */ mgmt->seq_ctrl = 0; /* initialize only */ memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); if (send_frame) { skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_DEAUTH_FRAME_LEN); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); /* copy in frame */ skb_put_data(skb, mgmt, IEEE80211_DEAUTH_FRAME_LEN); if (sdata->vif.type != NL80211_IFTYPE_STATION || !(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } } static int ieee80211_put_s1g_cap(struct sk_buff *skb, struct ieee80211_sta_s1g_cap *s1g_cap) { if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_s1g_cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_S1G_CAPABILITIES); skb_put_u8(skb, sizeof(struct ieee80211_s1g_cap)); skb_put_data(skb, &s1g_cap->cap, sizeof(s1g_cap->cap)); skb_put_data(skb, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs)); return 0; } static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const u8 *ie, size_t ie_len, size_t *offset, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, u32 flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; int i, err; size_t noffset; bool have_80mhz = false; *offset = 0; sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband)) return 0; /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; /* insert "request information" if in custom IEs */ if (ie && ie_len) { static const u8 before_extrates[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_REQUEST, }; noffset = ieee80211_ie_split(ie, ie_len, before_extrates, ARRAY_SIZE(before_extrates), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (skb_tailroom(skb) < 3) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_DS_PARAMS); skb_put_u8(skb, 1); skb_put_u8(skb, ieee80211_frequency_to_channel(chandef->chan->center_freq)); } if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) return 0; /* insert custom IEs that go before HT 
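	 * (the caller-supplied elements are split with ieee80211_ie_split()
	 * so that they are interleaved at the positions element ordering
	 * requires)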
*/ if (ie && ie_len) { static const u8 before_ht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (sband->ht_cap.ht_supported) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap)); ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap); } /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } /* Check if any channel in this sband supports at least 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (sband->vht_cap.vht_supported && have_80mhz) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); } /* insert custom IEs that go before HE */ if (ie && ie_len) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, WLAN_EID_AP_CSN, /* TODO: add 11ah/11aj/11ak elements */ }; noffset = ieee80211_ie_split(ie, ie_len, before_he, ARRAY_SIZE(before_he), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE)) { err = ieee80211_put_he_cap(skb, sdata, sband, NULL); if (err) return err; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE | IEEE80211_CHAN_NO_EHT)) { err = ieee80211_put_eht_cap(skb, sdata, sband, NULL); if (err) return err; } err = ieee80211_put_he_6ghz_cap(skb, sdata, IEEE80211_SMPS_OFF); if (err) return err; /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. 
*/ return 0; } static int ieee80211_put_preq_ies(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { size_t custom_ie_offset = 0; int i, err; memset(ie_desc, 0, sizeof(*ie_desc)); for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { ie_desc->ies[i] = skb_tail_pointer(skb); err = ieee80211_put_preq_ies_band(skb, sdata, ie, ie_len, &custom_ie_offset, i, rate_masks[i], chandef, flags); if (err) return err; ie_desc->len[i] = skb_tail_pointer(skb) - ie_desc->ies[i]; } } /* add any remaining custom IEs */ if (ie && ie_len) { if (WARN_ONCE(skb_tailroom(skb) < ie_len - custom_ie_offset, "not enough space for preq custom IEs\n")) return -ENOBUFS; ie_desc->common_ies = skb_tail_pointer(skb); skb_put_data(skb, ie + custom_ie_offset, ie_len - custom_ie_offset); ie_desc->common_ie_len = skb_tail_pointer(skb) - ie_desc->common_ies; } return 0; }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { struct sk_buff *skb = alloc_skb(buffer_len, GFP_KERNEL); uintptr_t offs; int ret, i; u8 *start; if (!skb) return -ENOMEM; start = skb_tail_pointer(skb); memset(start, 0, skb_tailroom(skb)); ret = ieee80211_put_preq_ies(skb, sdata, ie_desc, ie, ie_len, bands_used, rate_masks, chandef, flags); if (ret < 0) { goto out; } if (skb->len > buffer_len) { ret = -ENOBUFS; goto out; } memcpy(buffer, start, skb->len); /* adjust ie_desc for copy */ for (i = 0; i < NUM_NL80211_BANDS; i++) { offs = ie_desc->ies[i] - start; ie_desc->ies[i] = buffer + offs; } offs = ie_desc->common_ies - start; ie_desc->common_ies = buffer + offs; ret = skb->len; out: consume_skb(skb); return ret; } struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* * Do not send DS Channel parameter for directed probe requests * in order to maximize the chance that we get a response. Some * badly-behaved APs don't respond when this parameter is included. 
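	 *
	 * Leaving chandef.chan unset below (for IEEE80211_PROBE_FLAG_DIRECTED)
	 * is what suppresses it: ieee80211_put_preq_ies_band() only emits
	 * WLAN_EID_DS_PARAMS when chandef->chan is set on the 2.4 GHz band.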
*/ chandef.width = sdata->vif.bss_conf.chanreq.oper.width; if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len, local->scan_ies_len + ie_len); if (!skb) return NULL; rate_masks[chan->band] = ratemask; ieee80211_put_preq_ies(skb, sdata, &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); if (dst) { mgmt = (struct ieee80211_mgmt *) skb->data; memcpy(mgmt->da, dst, ETH_ALEN); memcpy(mgmt->bssid, dst, ETH_ALEN); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; return skb; } u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; u32 supp_rates; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + elems->ext_supp_rates_len; i++) { u8 rate = 0; int own_rate; bool is_basic; if (i < elems->supp_rates_len) rate = elems->supp_rates[i]; else if (elems->ext_supp_rates) rate = elems->ext_supp_rates [i - elems->supp_rates_len]; own_rate = 5 * (rate & 0x7f); is_basic = !!(rate & 0x80); if (is_basic && (rate & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY) continue; for (j = 0; j < num_rates; j++) { int brate = sband->bitrates[j].bitrate; if (brate == own_rate) { supp_rates |= BIT(j); if (basic_rates && is_basic) *basic_rates |= BIT(j); } } } return supp_rates; } void ieee80211_stop_device(struct ieee80211_local *local, bool suspend) { local_bh_disable(); ieee80211_handle_queued_frames(local); local_bh_enable(); ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); wiphy_work_cancel(local->hw.wiphy, &local->reconfig_filter); flush_workqueue(local->workqueue); wiphy_work_flush(local->hw.wiphy, NULL); drv_stop(local, suspend); } static void ieee80211_flush_completed_scan(struct ieee80211_local *local, bool aborted) { /* It's possible that we don't handle the scan completion in * time during suspend, so if it's still marked as completed * here, queue the work and flush it to clean things up. * Instead of calling the worker function directly here, we * really queue it to avoid potential races with other flows * scheduling the same work. */ if (test_bit(SCAN_COMPLETED, &local->scanning)) { /* If coming from reconfiguration failure, abort the scan so * we don't attempt to continue a partial HW scan - which is * possible otherwise if (e.g.) the 2.4 GHz portion was the * completed scan, and a 5 GHz portion is still pending. */ if (aborted) set_bit(SCAN_ABORTED, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); wiphy_delayed_work_flush(local->hw.wiphy, &local->scan_work); } } static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); /* * We get here if during resume the device can't be restarted properly. * We might also get here if this happens during HW reset, which is a * slightly different situation and we need to drop all connections in * the latter case. * * Ask cfg80211 to turn off all interfaces, this will result in more * warnings but at least we'll then get into a clean stopped state. 
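	 *
	 * Below we also drop the suspend/resume/reconfig flags, flush any
	 * completed scan, end scheduled scanning, and mark interfaces and
	 * channel contexts as no longer present in the driver.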
*/ local->resuming = false; local->suspended = false; local->in_reconfig = false; local->reconfig_failure = true; ieee80211_flush_completed_scan(local, true); /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state */ ieee80211_sched_scan_end(local); list_for_each_entry(sdata, &local->interfaces, list) sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Mark channel contexts as not being in the driver any more to avoid * removing them from the driver during the shutdown process... */ list_for_each_entry(ctx, &local->chanctx_list, list) ctx->driver_present = false; } static void ieee80211_assign_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (conf) { ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_assign_vif_chanctx(local, sdata, link->conf, ctx); } } static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); /* add STAs back */ list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata != sdata) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state + 1)); } } static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) { struct cfg80211_nan_func *func, **funcs; int res, id, i = 0; res = drv_start_nan(sdata->local, sdata, &sdata->u.nan.conf); if (WARN_ON(res)) return res; funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1, sizeof(*funcs), GFP_KERNEL); if (!funcs) return -ENOMEM; /* Add all the functions: * This is a little bit ugly. We need to call a potentially sleeping * callback for each NAN function, so we can't hold the spinlock. 
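	 * Instead, snapshot the functions into the temporary funcs[] array
	 * under the lock and call the driver for each entry after dropping it.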
*/ spin_lock_bh(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) funcs[i++] = func; spin_unlock_bh(&sdata->u.nan.func_lock); for (i = 0; funcs[i]; i++) { res = drv_add_nan_func(sdata->local, sdata, funcs[i]); if (WARN_ON(res)) ieee80211_nan_func_terminated(&sdata->vif, funcs[i]->instance_id, NL80211_NAN_FUNC_TERM_REASON_ERROR, GFP_KERNEL); } kfree(funcs); return 0; } static void ieee80211_reconfig_ap_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed) { int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!(sdata->vif.active_links & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (rcu_access_pointer(link->u.ap.beacon)) drv_start_ap(local, sdata, link->conf); if (!link->conf->enable_beacon) continue; changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_link_info_change_notify(sdata, link, changed); } } int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; struct sta_info *sta; int res, i; bool reconfig_due_to_wowlan = false; struct ieee80211_sub_if_data *sched_scan_sdata; struct cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; lockdep_assert_wiphy(local->hw.wiphy); /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM if (suspended) local->resuming = true; if (local->wowlan) { /* * In the wowlan case, both mac80211 and the device * are functional when the resume op is called, so * clear local->suspended so the device could operate * normally (e.g. pass rx frames). */ local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { local->resuming = false; return res; } if (res == 0) goto wake_up; WARN_ON(res > 1); /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; local->suspended = true; } #endif /* * In case of hw_restart during suspend (without wowlan), * cancel restart work, as we are reconfiguring the device * anyway. * Note that restart_work is scheduled on a frozen workqueue, * so we can't deadlock in this case. */ if (suspended && local->in_reconfig && !reconfig_due_to_wowlan) cancel_work_sync(&local->restart_work); local->started = false; /* * Upon resume hardware can sometimes be goofy due to * various platform / driver / bus issues, so restarting * the device may at times not work immediately. Propagate * the error. */ res = drv_start(local); if (res) { if (suspended) WARN(1, "Hardware became unavailable upon resume. 
This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); ieee80211_handle_reconfig_failure(local); return res; } /* setup fragmentation threshold */ drv_set_frag_threshold(local, -1, hw->wiphy->frag_threshold); /* setup RTS threshold */ if (hw->wiphy->n_radio > 0) { for (i = 0; i < hw->wiphy->n_radio; i++) { u32 rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold; drv_set_rts_threshold(local, i, rts_threshold); } } else { drv_set_rts_threshold(local, -1, hw->wiphy->rts_threshold); } /* reset coverage class */ drv_set_coverage_class(local, -1, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { RCU_INIT_POINTER(local->monitor_sdata, NULL); synchronize_net(); kfree(sdata); } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) break; } } /* If adding any of the interfaces failed above, roll back and * report failure. */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); } ieee80211_handle_reconfig_failure(local); return res; } /* add channel contexts */ list_for_each_entry(ctx, &local->chanctx_list, list) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) WARN_ON(drv_add_chanctx(local, ctx)); sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | IEEE80211_CONF_CHANGE_MONITOR | IEEE80211_CONF_CHANGE_PS | IEEE80211_CONF_CHANGE_RETRY_LIMITS | IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); /* Finally also reconfigure all the BSS information */ list_for_each_entry(sdata, &local->interfaces, list) { /* common change flags for all interface types - link only */ u64 changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | BSS_CHANGED_ERP_SLOT | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT | BSS_CHANGED_BSSID | BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_TXPOWER | BSS_CHANGED_MCAST_RATE; struct ieee80211_link_data *link = NULL; unsigned int link_id; u32 active_links = 0; if (!ieee80211_sdata_running(sdata)) continue; if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = { [0] = &sdata->vif.bss_conf, }; if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* start with a single active link */ active_links = sdata->vif.active_links; link_id = ffs(active_links) - 1; sdata->vif.active_links = BIT(link_id); } drv_change_vif_links(local, sdata, 0, 
sdata->vif.active_links, old); } sdata->restart_active_links = active_links; for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_assign_chanctx(local, sdata, link); } switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: break; case NL80211_IFTYPE_ADHOC: if (sdata->vif.cfg.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); fallthrough; default: ieee80211_reconfig_stations(sdata); fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, &sdata->deflink, i, &sdata->deflink.tx_conf[i]); break; } if (sdata->vif.bss_conf.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_IDLE; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_ASSOC | BSS_CHANGED_ARP_FILTER | BSS_CHANGED_PS; /* Re-send beacon info report to the driver */ if (sdata->deflink.u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; if (sdata->vif.bss_conf.max_idle_period || sdata->vif.bss_conf.protected_keep_alive) changed |= BSS_CHANGED_KEEP_ALIVE; ieee80211_bss_info_change_notify(sdata, changed); } else if (!WARN_ON(!link)) { ieee80211_link_info_change_notify(sdata, link, changed); changed = BSS_CHANGED_ASSOC | BSS_CHANGED_IDLE | BSS_CHANGED_PS | BSS_CHANGED_ARP_FILTER; ieee80211_vif_cfg_change_notify(sdata, changed); } break; case NL80211_IFTYPE_OCB: changed |= BSS_CHANGED_OCB; ieee80211_bss_info_change_notify(sdata, changed); break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_P2P_PS; if (ieee80211_vif_is_mld(&sdata->vif)) ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); else changed |= BSS_CHANGED_SSID; if (sdata->vif.bss_conf.ftm_responder == 1 && wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) changed |= BSS_CHANGED_FTM_RESPONDER; if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; if (ieee80211_vif_is_mld(&sdata->vif)) { ieee80211_reconfig_ap_links(local, sdata, changed); break; } if (rcu_access_pointer(sdata->deflink.u.ap.beacon)) drv_start_ap(local, sdata, sdata->deflink.conf); } fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_bss_info_change_notify(sdata, changed); } break; case NL80211_IFTYPE_NAN: res = ieee80211_reconfig_nan(sdata); if (res < 0) { ieee80211_handle_reconfig_failure(local); return res; } break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_WDS: WARN_ON(1); break; } } ieee80211_recalc_ps(local); /* * The sta might be in psm against the ap (e.g. because * this was the state before a hw restart), so we * explicitly send a null packet in order to make sure * it'll sync against the ap (and get out of psm). 
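	 *
	 * Only associated station interfaces are covered below, and only
	 * while powersave is not currently set in the hardware config.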
*/ if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; if (!sdata->u.mgd.associated) continue; ieee80211_send_nullfunc(local, sdata, false); } } /* APs are now beaconing, add back stations */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: ieee80211_reconfig_stations(sdata); break; default: break; } } /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); /* re-enable multi-link for client interfaces */ list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->restart_active_links) ieee80211_set_active_links(&sdata->vif, sdata->restart_active_links); /* * If a link switch was scheduled before the restart, and ran * before reconfig, it will do nothing, so re-schedule. */ if (sdata->desired_active_links) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->activate_links_work); } /* Reconfigure sched scan if it was interrupted by FW restart */ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); sched_scan_req = rcu_dereference_protected(local->sched_scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata && sched_scan_req) /* * Sched scan stopped, but we don't want to report it. Instead, * we're trying to reschedule. However, if more than one scan * plan was set, we cannot reschedule since we don't know which * scan plan was currently running (and some scan plans may have * already finished). */ if (sched_scan_req->n_scan_plans > 1 || __ieee80211_request_sched_scan_start(sched_scan_sdata, sched_scan_req)) { RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_stopped = true; } if (sched_scan_stopped) cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. * * Also tear down aggregation sessions since reconfiguring * them in a hardware restart scenario is not easily done * right now, and the hardware will have lost information * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { list_for_each_entry(sta, &local->sta_list, list) { if (!local->resuming) ieee80211_sta_tear_down_BA_sessions( sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } } /* * If this is for hw restart things are still running. * We may want to change that later, however. 
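	 * Hence the restart-type reconfig completion is reported to the
	 * driver below whenever the hardware is open and this is not a
	 * plain (non-wowlan) resume.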
*/ if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); if (local->in_reconfig) { in_reconfig = local->in_reconfig; local->in_reconfig = false; barrier(); ieee80211_reconfig_roc(local); /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(local->hw.wiphy, &sdata->work); } } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); if (in_reconfig) { list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } } /* Passing NULL means an interface is picked for configuration */ if (local->virt_monitors > 0 && local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local, NULL); if (!suspended) return 0; #ifdef CONFIG_PM /* first set suspended false, then resuming */ local->suspended = false; mb(); local->resuming = false; ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } mod_timer(&local->sta_cleanup, jiffies + 1); #else WARN_ON(1); #endif return 0; } static void ieee80211_reconfig_disconnect(struct ieee80211_vif *vif, u8 flag) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; struct ieee80211_key *key; if (WARN_ON(!vif)) return; sdata = vif_to_sdata(vif); local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_RESUME && !local->resuming)) return; if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_HW_RESTART && !local->in_reconfig)) return; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return; sdata->flags |= flag; list_for_each_entry(key, &sdata->key_list, list) key->flags |= KEY_FLAG_TAINTED; } void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_HW_RESTART); } EXPORT_SYMBOL_GPL(ieee80211_hw_restart_disconnect); void ieee80211_resume_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_RESUME); } EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; lockdep_assert_wiphy(local->hw.wiphy); chanctx_conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * This function can be called from a work, thus it may be possible * that the chanctx_conf is removed (due to a disconnection, for * example). * So nothing should be done in such case. 
*/ if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_smps_chanctx(local, chanctx); } void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, int link_id) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; int i; lockdep_assert_wiphy(local->hw.wiphy); for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *bss_conf; if (link_id >= 0 && link_id != i) continue; rcu_read_lock(); bss_conf = rcu_dereference(sdata->vif.link_conf[i]); if (!bss_conf) { rcu_read_unlock(); continue; } chanctx_conf = rcu_dereference_protected(bss_conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * Since we hold the wiphy mutex (checked above) * we can take the chanctx_conf pointer out of the * RCU critical section, it cannot go away without * the mutex. Just the way we reached it could - in * theory - go away, but we don't really care and * it really shouldn't happen anyway. */ rcu_read_unlock(); if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_min_def(local, chanctx); } } size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) { size_t pos = offset; while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) pos += 2 + ies[pos + 1]; return pos; } u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap) { __le16 tmp; *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); /* capability flags */ tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); /* AMPDU parameters */ *pos++ = ht_cap->ampdu_factor | (ht_cap->ampdu_density << IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); /* MCS set */ memcpy(pos, &ht_cap->mcs, sizeof(ht_cap->mcs)); pos += sizeof(ht_cap->mcs); /* extended capabilities */ pos += sizeof(__le16); /* BF capabilities */ pos += sizeof(__le32); /* antenna selection */ pos += sizeof(u8); return pos; } u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap) { __le32 tmp; *pos++ = WLAN_EID_VHT_CAPABILITY; *pos++ = sizeof(struct ieee80211_vht_cap); memset(pos, 0, sizeof(struct ieee80211_vht_cap)); /* capability flags */ tmp = cpu_to_le32(cap); memcpy(pos, &tmp, sizeof(u32)); pos += sizeof(u32); /* VHT MCS set */ memcpy(pos, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs)); pos += sizeof(vht_cap->vht_mcs); return pos; } /* this may return more than ieee80211_put_he_6ghz_cap() will need */ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); return 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); } static void ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn, const struct ieee80211_sta_he_cap *he_cap, struct ieee80211_he_cap_elem *elem) { u8 ru_limit, max_ru; *elem = he_cap->he_cap_elem; switch (conn->bw_limit) { case IEEE80211_CONN_BW_LIMIT_20: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242; break; case IEEE80211_CONN_BW_LIMIT_40: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484; break; case 
IEEE80211_CONN_BW_LIMIT_80: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996; break; default: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996; break; } max_ru = elem->phy_cap_info[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; max_ru = min(max_ru, ru_limit); elem->phy_cap_info[8] &= ~IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; elem->phy_cap_info[8] |= max_ru; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G); elem->phy_cap_info[9] &= ~IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G); elem->phy_cap_info[5] &= ~IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK; elem->phy_cap_info[7] &= ~(IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ | IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ); } } int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_he_cap_elem elem; u8 *len; u8 n; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; /* modify on stack first to calculate 'n' and 'ie_len' correctly */ ieee80211_get_adjusted_he_cap(conn, he_cap, &elem); n = ieee80211_he_mcs_nss_size(&elem); ie_len = 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); /* We'll set the size later below */ skb_put_u8(skb, WLAN_EID_EXT_HE_CAPABILITY); /* Fixed data */ skb_put_data(skb, &elem, sizeof(elem)); skb_put_data(skb, &he_cap->he_mcs_nss_supp, n); /* Check if PPE Threshold should be present */ if ((he_cap->he_cap_elem.phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) goto end; /* * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) */ n = hweight8(he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> IEEE80211_PPE_THRES_NSS_POS)); /* * Each pair is 6 bits, and we need to add the 7 "header" bits to the * total size. 
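* For example, with NSS_M1 = 1 and two bits set in RU_INDEX_BITMASK there
* are 2 * 2 = 4 pairs, i.e. 4 * 6 + 7 = 31 bits, rounded up to 4 octets below.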
*/ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; n = DIV_ROUND_UP(n, 8); /* Copy PPE Thresholds */ skb_put_data(skb, &he_cap->ppe_thres, n); end: *len = skb_tail_pointer(skb) - len - 1; return 0; } int ieee80211_put_reg_conn(struct sk_buff *skb, enum ieee80211_channel_flags flags) { u8 reg_conn = IEEE80211_REG_CONN_LPI_VALID | IEEE80211_REG_CONN_LPI_VALUE | IEEE80211_REG_CONN_SP_VALID; if (!(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT)) reg_conn |= IEEE80211_REG_CONN_SP_VALUE; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, 1 + sizeof(reg_conn)); skb_put_u8(skb, WLAN_EID_EXT_NON_AP_STA_REG_CON); skb_put_u8(skb, reg_conn); return 0; } int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode) { struct ieee80211_supported_band *sband; const struct ieee80211_sband_iftype_data *iftd; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap; if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) return 0; sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (!iftd) return 0; /* Check for device HE 6 GHz capability before adding element */ if (!iftd->he_6ghz_capa.capa) return 0; cap = iftd->he_6ghz_capa.capa; cap &= cpu_to_le16(~IEEE80211_HE_6GHZ_CAP_SM_PS); switch (smps_mode) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_STATIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_DYNAMIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; } if (skb_tailroom(skb) < 2 + 1 + sizeof(cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, 1 + sizeof(cap)); skb_put_u8(skb, WLAN_EID_EXT_HE_6GHZ_CAPA); skb_put_data(skb, &cap, sizeof(cap)); return 0; } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) { struct ieee80211_ht_operation *ht_oper; /* Build HT Information */ *pos++ = WLAN_EID_HT_OPERATION; *pos++ = sizeof(struct ieee80211_ht_operation); ht_oper = (struct ieee80211_ht_operation *)pos; ht_oper->primary_chan = ieee80211_frequency_to_channel( chandef->chan->center_freq); switch (chandef->width) { case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_40: if (chandef->center_freq1 > chandef->chan->center_freq) ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_BELOW; break; case NL80211_CHAN_WIDTH_320: /* HT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; break; } if (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_20) ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY; if (rifs_mode) ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE; ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x0000; /* It seems that Basic MCS set and Supported MCS set are identical for the first 10 bytes */ memset(&ht_oper->basic_set, 0, 16); memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10); return pos + sizeof(struct 
ieee80211_ht_operation); } void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef) { *pos++ = WLAN_EID_WIDE_BW_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ /* New channel width */ switch (chandef->width) { case NL80211_CHAN_WIDTH_80: *pos++ = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_160: *pos++ = IEEE80211_VHT_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80P80: *pos++ = IEEE80211_VHT_CHANWIDTH_80P80MHZ; break; case NL80211_CHAN_WIDTH_320: /* The behavior is not defined for 320 MHz channels */ WARN_ON(1); fallthrough; default: *pos++ = IEEE80211_VHT_CHANWIDTH_USE_HT; } /* new center frequency segment 0 */ *pos++ = ieee80211_frequency_to_channel(chandef->center_freq1); /* new center frequency segment 1 */ if (chandef->center_freq2) *pos++ = ieee80211_frequency_to_channel(chandef->center_freq2); else *pos++ = 0; } u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef) { struct ieee80211_vht_operation *vht_oper; *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: /* * Convert 160 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) vht_oper->center_freq_seg0_idx -= 8; else vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* * Convert 80+80 MHz channel width to new style as interop * workaround. 
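* In the new style the signalled width stays 80 MHz and the second segment
* is indicated by CCFS1 (already filled in above) being more than eight
* channel numbers (40 MHz) away from CCFS0; cf. ieee80211_chandef_vht_oper().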
*/ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_320: /* VHT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } /* don't require special VHT peer rates */ vht_oper->basic_mcs_set = cpu_to_le16(0xffff); return pos + sizeof(struct ieee80211_vht_operation); } u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; struct cfg80211_chan_def he_chandef; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); if (chandef->chan->band == NL80211_BAND_6GHZ) ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; he_oper_params |= u32_encode_bits(1023, /* disabled */ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (chandef->chan->band == NL80211_BAND_6GHZ) he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); pos += sizeof(struct ieee80211_he_operation); if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; cfg80211_chandef_create(&he_chandef, chandef->chan, NL80211_CHAN_NO_HT); he_chandef.center_freq1 = chandef->center_freq1; he_chandef.center_freq2 = chandef->center_freq2; he_chandef.width = chandef->width; /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = ieee80211_frequency_to_channel(he_chandef.chan->center_freq); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(he_chandef.center_freq1); if (he_chandef.center_freq2) he_6ghz_op->ccfs1 = ieee80211_frequency_to_channel(he_chandef.center_freq2); else he_6ghz_op->ccfs1 = 0; switch (he_chandef.width) { case NL80211_CHAN_WIDTH_320: /* Downgrade EHT 320 MHz BW to 160 MHz for HE and set new * center_freq1 */ ieee80211_chandef_downgrade(&he_chandef, NULL); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(he_chandef.center_freq1); fallthrough; case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. 
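* CCFS1 keeps the centre of the full 160 MHz channel while CCFS0 is shifted
* by eight channel numbers (40 MHz) onto the 80 MHz half that holds the
* primary channel; the parsing side treats |CCFS1 - CCFS0| == 8 as 160 MHz.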
*/ he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; if (he_chandef.chan->center_freq < he_chandef.center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } pos += sizeof(struct ieee80211_he_6ghz_oper); out: return pos; } u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap) { const struct ieee80211_eht_mcs_nss_supp_20mhz_only *eht_mcs_nss = &eht_cap->eht_mcs_nss_supp.only_20mhz; struct ieee80211_eht_operation *eht_oper; struct ieee80211_eht_operation_info *eht_oper_info; u8 eht_oper_len = offsetof(struct ieee80211_eht_operation, optional); u8 eht_oper_info_len = offsetof(struct ieee80211_eht_operation_info, optional); u8 chan_width = 0; *pos++ = WLAN_EID_EXTENSION; *pos++ = 1 + eht_oper_len + eht_oper_info_len; *pos++ = WLAN_EID_EXT_EHT_OPERATION; eht_oper = (struct ieee80211_eht_operation *)pos; memcpy(&eht_oper->basic_mcs_nss, eht_mcs_nss, sizeof(*eht_mcs_nss)); eht_oper->params |= IEEE80211_EHT_OPER_INFO_PRESENT; pos += eht_oper_len; eht_oper_info = (struct ieee80211_eht_operation_info *)eht_oper->optional; eht_oper_info->ccfs0 = ieee80211_frequency_to_channel(chandef->center_freq1); if (chandef->center_freq2) eht_oper_info->ccfs1 = ieee80211_frequency_to_channel(chandef->center_freq2); else eht_oper_info->ccfs1 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_320: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 16; else eht_oper_info->ccfs0 += 16; break; case NL80211_CHAN_WIDTH_160: eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 8; else eht_oper_info->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ; break; default: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ; break; } eht_oper_info->control = chan_width; pos += eht_oper_info_len; /* TODO: eht_oper_info->optional */ return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; if (!ht_oper) return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: channel_type = NL80211_CHAN_HT20; break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: channel_type = NL80211_CHAN_HT40PLUS; break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: channel_type = NL80211_CHAN_HT40MINUS; break; default: return false; } cfg80211_chandef_create(chandef, chandef->chan, channel_type); return true; } bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def new = *chandef; int cf0, 
cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; u32 vht_cap; bool support_80_80 = false; bool support_160 = false; u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); u8 supp_chwidth = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); support_80_80 = ((vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; ccf0 = ccfs0; /* if not supported, parse as though we didn't understand it */ if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) ext_nss_bw_supp = 0; /* * Cf. IEEE 802.11 Table 9-250 * * We really just consider that because it's inefficient to connect * at a higher bandwidth than we'll actually be able to use. */ switch ((supp_chwidth << 4) | ext_nss_bw_supp) { default: case 0x00: ccf1 = 0; support_160 = false; support_80_80 = false; break; case 0x01: support_80_80 = false; fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; break; case 0x10: ccf1 = ccfs1; break; case 0x11: case 0x12: if (!ccfs1) ccf1 = ccfs2; else ccf1 = ccfs1; break; case 0x13: case 0x20: case 0x23: ccf1 = ccfs1; break; } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: /* just use HT information directly */ break; case IEEE80211_VHT_CHANWIDTH_80MHZ: new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf0; /* If needed, adjust based on the newer interop workaround. 
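* A CCFS1 that is eight channel numbers (40 MHz) away from CCFS0 indicates a
* contiguous 160 MHz channel centred on CCFS1; a larger distance indicates
* 80+80 MHz with CCFS1 as the centre of the second segment.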
*/ if (ccf1) { unsigned int diff; diff = abs(ccf1 - ccf0); if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq1 = cf0; new.center_freq2 = cf1; break; default: return false; } if (!cfg80211_chandef_valid(&new)) return false; *chandef = new; return true; } void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, struct cfg80211_chan_def *chandef) { chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs0, chandef->chan->band); switch (u8_get_bits(info->control, IEEE80211_EHT_OPER_CHAN_WIDTH)) { case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ: chandef->width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ: chandef->width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ: chandef->width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ: chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ: chandef->width = NL80211_CHAN_WIDTH_320; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; } } bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; u32 freq; if (chandef->chan->band != NL80211_BAND_6GHZ) return true; if (!he_oper) return false; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return false; /* * The EHT operation IE does not contain the primary channel so the * primary channel frequency should be taken from the 6 GHz operation * information. 
*/
*/ freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, NL80211_BAND_6GHZ); he_chandef.chan = ieee80211_get_channel(local->hw.wiphy, freq); if (!he_chandef.chan) return false; if (!eht_oper || !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) { switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: he_chandef.width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: he_chandef.width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; if (!he_6ghz_oper->ccfs1) break; if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) he_chandef.width = NL80211_CHAN_WIDTH_160; else he_chandef.width = NL80211_CHAN_WIDTH_80P80; break; } if (he_chandef.width == NL80211_CHAN_WIDTH_160) { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } else { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, NL80211_BAND_6GHZ); he_chandef.center_freq2 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } } else { ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &he_chandef); he_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); } if (!cfg80211_chandef_valid(&he_chandef)) return false; *chandef = he_chandef; return true; } bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz; if (!oper) return false; switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) { case IEEE80211_S1G_CHANWIDTH_1MHZ: chandef->width = NL80211_CHAN_WIDTH_1; break; case IEEE80211_S1G_CHANWIDTH_2MHZ: chandef->width = NL80211_CHAN_WIDTH_2; break; case IEEE80211_S1G_CHANWIDTH_4MHZ: chandef->width = NL80211_CHAN_WIDTH_4; break; case IEEE80211_S1G_CHANWIDTH_8MHZ: chandef->width = NL80211_CHAN_WIDTH_8; break; case IEEE80211_S1G_CHANWIDTH_16MHZ: chandef->width = NL80211_CHAN_WIDTH_16; break; default: return false; } chandef->s1g_primary_2mhz = false; switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) { case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: pri_1mhz_khz = ieee80211_channel_to_freq_khz( oper->primary_ch, NL80211_BAND_S1GHZ); break; case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: chandef->s1g_primary_2mhz = true; pri_2mhz_khz = ieee80211_channel_to_freq_khz( oper->primary_ch, NL80211_BAND_S1GHZ); if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) == S1G_2M_PRIMARY_LOCATION_LOWER) pri_1mhz_khz = pri_2mhz_khz - 500; else pri_1mhz_khz = pri_2mhz_khz + 500; break; default: return false; } oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch, NL80211_BAND_S1GHZ); chandef->center_freq1 = KHZ_TO_MHZ(oper_khz); chandef->freq1_offset = oper_khz % 1000; chandef->chan = ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz); return chandef->chan; } int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, u32 basic_rates, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if (masked_rates & BIT(i)) continue; rates++; } if (element_id == WLAN_EID_SUPP_RATES) { rates = min_t(u8, rates, 8); skip = 0; } else { skip = 8; if (rates <= skip) return 0; rates -= skip; } if (skb_tailroom(skb) < rates + 2) return -ENOBUFS; 
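/*
 * Element header first; each following rate octet is the bitrate in 500 kbps
 * units (bitrate values are in 100 kbps, hence the divide by 5), with the top
 * bit set for rates that are part of the basic rate set.
 */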
skb_put_u8(skb, element_id); skb_put_u8(skb, rates); for (i = 0; i < sband->n_bitrates && rates; i++) { int rate; u8 basic; if (masked_rates & BIT(i)) continue; if (skip > 0) { skip--; continue; } basic = basic_rates & BIT(i) ? 0x80 : 0; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); skb_put_u8(skb, basic | (u8)rate); rates--; } WARN(rates > 0, "rates confused: rates:%d, element:%d\n", rates, element_id); return 0; } int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link_data; if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return 0; if (link_id < 0) link_data = &sdata->deflink; else link_data = wiphy_dereference(sdata->local->hw.wiphy, sdata->link[link_id]); if (WARN_ON_ONCE(!link_data)) return -99; return -ewma_beacon_signal_read(&link_data->u.mgd.ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs) { if (!mcs) return 1; /* TODO: consider rx_highest */ if (mcs->rx_mask[3]) return 4; if (mcs->rx_mask[2]) return 3; if (mcs->rx_mask[1]) return 2; return 1; } /** * ieee80211_calculate_rx_timestamp - calculate timestamp in frame * @local: mac80211 hw info struct * @status: RX status * @mpdu_len: total MPDU length (including FCS) * @mpdu_offset: offset into MPDU to calculate timestamp at * * This function calculates the RX timestamp at the given MPDU offset, taking * into account what the RX timestamp was. An offset of 0 will just normalize * the timestamp to TSF at beginning of MPDU reception. * * Returns: the calculated timestamp */ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset) { u64 ts = status->mactime; bool mactime_plcp_start; struct rate_info ri; u16 rate; u8 n_ltf; if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; mactime_plcp_start = (status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_PLCP_START; memset(&ri, 0, sizeof(ri)); ri.bw = status->bw; /* Fill cfg80211 rate info */ switch (status->encoding) { case RX_ENC_EHT: ri.flags |= RATE_INFO_FLAGS_EHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.eht_ru_alloc = status->eht.ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* TODO/FIXME: is this right? handle other PPDUs */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; } break; case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.he_ru_alloc = status->he_ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11ax_D6.0, section 27.3.4 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * TODO: * For HE MU PPDU, add the HE-SIG-B. * For HE ER PPDU, add 8us for the HE-SIG-A. * For HE TB PPDU, add 4us for the HE-STF. * Add the HE-LTF durations - variable. */ } break; case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 19.3.2 for * HT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; if (status->enc_flags & RX_ENC_FLAG_HT_GF) ts += 24; else ts += 32; /* * Add Data HT-LTFs per streams * TODO: add Extension HT-LTFs, 4us per LTF */ n_ltf = ((ri.mcs >> 3) & 3) + 1; n_ltf = n_ltf == 3 ? 
4 : n_ltf; ts += n_ltf * 4; } break; case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 21.3.2 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * Add VHT-LTFs per streams */ n_ltf = (ri.nss != 1) && (ri.nss % 2) ? ri.nss + 1 : ri.nss; ts += 4 * n_ltf; } break; default: WARN_ON(1); fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[status->band]; ri.legacy = sband->bitrates[status->rate_idx].bitrate; if (mactime_plcp_start) { if (status->band == NL80211_BAND_5GHZ) { ts += 20; mpdu_offset += 2; } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) { ts += 96; } else { ts += 192; } } break; } } rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", (unsigned long long)status->flag, status->rate_idx, status->nss)) return 0; /* rewind from end of MPDU */ if ((status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_END) ts -= mpdu_len * 8 * 10 / rate; ts += mpdu_offset * 8 * 10 / rate; return ts; } /* Cancel CAC for the interfaces under the specified @local. If @ctx is * also provided, only the interfaces using that ctx will be canceled. */ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link; struct ieee80211_chanctx_conf *chanctx_conf; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; chanctx_conf = sdata_dereference(link->conf->chanctx_conf, sdata); if (ctx && &ctx->conf != chanctx_conf) continue; wiphy_delayed_work_cancel(local->hw.wiphy, &link->dfs_cac_timer_work); if (!sdata->wdev.links[link_id].cac_started) continue; chandef = link->conf->chanreq.oper; ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); } } } void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!ctx->radar_detected) continue; ctx->radar_detected = false; chandef = ctx->conf.def; ieee80211_dfs_cac_cancel(local, ctx); cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); } } static void ieee80211_radar_mark_chan_ctx_iterator(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data) { struct ieee80211_chanctx *ctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) return; if (data && data != chanctx_conf) return; ctx->radar_detected = true; } void ieee80211_radar_detected(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_local *local = hw_to_local(hw); trace_api_radar_detected(local); ieee80211_iter_chan_contexts_atomic(hw, ieee80211_radar_mark_chan_ctx_iterator, chanctx_conf); 
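/*
 * Only mark the affected contexts here; CAC cancellation and the cfg80211
 * radar event are then reported from ieee80211_dfs_radar_detected_work().
 */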
wiphy_work_queue(hw->wiphy, &local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c, struct ieee80211_conn_settings *conn) { enum nl80211_chan_width new_primary_width; struct ieee80211_conn_settings _ignored = {}; /* allow passing NULL if caller doesn't care */ if (!conn) conn = &_ignored; again: /* no-HT indicates nothing to do */ new_primary_width = NL80211_CHAN_WIDTH_20_NOHT; switch (c->width) { default: case NL80211_CHAN_WIDTH_20_NOHT: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20: c->width = NL80211_CHAN_WIDTH_20_NOHT; conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_40: c->width = NL80211_CHAN_WIDTH_20; c->center_freq1 = c->chan->center_freq; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_80: new_primary_width = NL80211_CHAN_WIDTH_40; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_40; break; case NL80211_CHAN_WIDTH_80P80: c->center_freq2 = 0; c->width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_160: new_primary_width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_320: new_primary_width = NL80211_CHAN_WIDTH_160; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; break; case NL80211_CHAN_WIDTH_1: case NL80211_CHAN_WIDTH_2: case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; } if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) { c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width, &c->punctured); c->width = new_primary_width; } /* * With an 80 MHz channel, we might have the puncturing in the primary * 40 Mhz channel, but that's not valid when downgraded to 40 MHz width. * In that case, downgrade again. 
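* (Puncturing is only defined for 80 MHz and wider, so a 40 MHz chandef that
* still carries a puncturing bitmap fails cfg80211_chandef_valid() and gets
* downgraded once more, to 20 MHz, which clears the bitmap.)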
*/ if (!cfg80211_chandef_valid(c) && c->punctured) goto again; WARN_ON_ONCE(!cfg80211_chandef_valid(c)); } int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; skb = dev_alloc_skb(local->tx_headroom + hdr_len + 5 + /* channel switch announcement element */ 3 + /* secondary channel offset element */ 5 + /* wide bandwidth channel switch announcement */ 8); /* mesh channel switch parameters element */ if (!skb) return -ENOMEM; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); if (ieee80211_vif_is_mesh(&sdata->vif)) { memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ *pos++ = csa_settings->block_tx ? 1 : 0; /* CSA mode */ freq = csa_settings->chandef.chan->center_freq; *pos++ = ieee80211_frequency_to_channel(freq); /* channel */ *pos++ = csa_settings->count; /* count */ if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) { enum nl80211_channel_type ch_type; skb_put(skb, 3); *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET; /* EID */ *pos++ = 1; /* IE length */ ch_type = cfg80211_get_chandef_type(&csa_settings->chandef); if (ch_type == NL80211_CHAN_HT40PLUS) *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW; } if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; skb_put(skb, 8); *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; /* EID */ *pos++ = 6; /* IE length */ *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL; /* Mesh TTL */ *pos = 0x00; /* Mesh Flag: Tx Restrict, Initiator, Reason */ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; *pos++ |= csa_settings->block_tx ? 
WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00; put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */ pos += 2; put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */ pos += 2; } if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_80P80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_160) { skb_put(skb, 5); ieee80211_ie_build_wide_bw_cs(pos, &csa_settings->chandef); } ieee80211_tx_skb(sdata, skb); return 0; } static bool ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) { s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1); int skip; if (end > 0) return false; /* One shot NOA */ if (data->count[i] == 1) return false; if (data->desc[i].interval == 0) return false; /* End time is in the past, check for repetitions */ skip = DIV_ROUND_UP(-end, data->desc[i].interval); if (data->count[i] < 255) { if (data->count[i] <= skip) { data->count[i] = 0; return false; } data->count[i] -= skip; } data->desc[i].start += skip * data->desc[i].interval; return true; } static bool ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf, s32 *offset) { bool ret = false; int i; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 cur; if (!data->count[i]) continue; if (ieee80211_extend_noa_desc(data, tsf + *offset, i)) ret = true; cur = data->desc[i].start - tsf; if (cur > *offset) continue; cur = data->desc[i].start + data->desc[i].duration - tsf; if (cur > *offset) *offset = cur; } return ret; } static u32 ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf) { s32 offset = 0; int tries = 0; /* * arbitrary limit, used to avoid infinite loops when combined NoA * descriptors cover the full time period. */ int max_tries = 5; ieee80211_extend_absent_time(data, tsf, &offset); do { if (!ieee80211_extend_absent_time(data, tsf, &offset)) break; tries++; } while (tries < max_tries); return offset; } void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf) { u32 next_offset = BIT(31) - 1; int i; data->absent = 0; data->has_next_tsf = false; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 start; if (!data->count[i]) continue; ieee80211_extend_noa_desc(data, tsf, i); start = data->desc[i].start - tsf; if (start <= 0) data->absent |= BIT(i); if (next_offset > start) next_offset = start; data->has_next_tsf = true; } if (data->absent) next_offset = ieee80211_get_noa_absent_time(data, tsf); data->next_tsf = tsf + next_offset; } EXPORT_SYMBOL(ieee80211_update_p2p_noa); int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf) { int ret = 0; int i; memset(data, 0, sizeof(*data)); for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i]; if (!desc->count || !desc->duration) continue; data->count[i] = desc->count; data->desc[i].start = le32_to_cpu(desc->start_time); data->desc[i].duration = le32_to_cpu(desc->duration); data->desc[i].interval = le32_to_cpu(desc->interval); if (data->count[i] > 1 && data->desc[i].interval < data->desc[i].duration) continue; ieee80211_extend_noa_desc(data, tsf, i); ret++; } if (ret) ieee80211_update_p2p_noa(data, tsf); return ret; } EXPORT_SYMBOL(ieee80211_parse_p2p_noa); void ieee80211_recalc_dtim(struct ieee80211_sub_if_data *sdata, u64 tsf) { u64 dtim_count = 0; u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; if (tsf 
== -1ULL || !beacon_int || !dtim_period) return; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!sdata->bss) return; ps = &sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { return; } /* * actually finds last dtim_count, mac80211 will update in * __beacon_add_tim(). * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period */ do_div(tsf, beacon_int); bcns_from_dtim = do_div(tsf, dtim_period); /* just had a DTIM */ if (!bcns_from_dtim) dtim_count = 0; else dtim_count = dtim_period - bcns_from_dtim; ps->dtim_count = dtim_count; } /* * Given a long beacon period, calculate the current index into * that period to determine the number of TSBTTs until the next TBTT. * It is completely valid to have a short beacon period that differs * from the dtim period (i.e a TBTT thats not a DTIM). */ void ieee80211_recalc_sb_count(struct ieee80211_sub_if_data *sdata, u64 tsf) { u32 sb_idx; struct ps_data *ps = &sdata->bss->ps; u8 lb_period = sdata->vif.bss_conf.s1g_long_beacon_period; u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; /* No mesh / IBSS support for short beaconing */ if (tsf == -1ULL || !lb_period || (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) return; /* find the current TSBTT index in our lb_period */ do_div(tsf, beacon_int); sb_idx = do_div(tsf, lb_period); /* num TSBTTs until the next TBTT */ ps->sb_count = sb_idx ? lb_period - sb_idx : 0; } static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_link_data *link; u8 radar_detect = 0; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; for_each_sdata_link(local, link) { if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { /* * An in-place reservation context should not have any * assigned links until it replaces the other context. */ WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER); if (link->radar_required) radar_detect |= BIT(link->conf->chanreq.oper.width); } if (link->reserved_chanctx == ctx && link->reserved_radar_required) radar_detect |= BIT(link->reserved.oper.width); } return radar_detect; } bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, struct cfg80211_scan_request *scan_req, int radio_idx) { struct ieee80211_channel *chan; int i, chan_radio_idx; for (i = 0; i < scan_req->n_channels; i++) { chan = scan_req->channels[i]; chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); /* The radio index either matched successfully, or an error * occurred. For example, if radio-level information is * missing, the same error value is returned. This * typically implies a single-radio setup, in which case * the operation should not be allowed. 
*/ if (chan_radio_idx == radio_idx) return true; } return false; } static u32 __ieee80211_get_radio_mask(struct ieee80211_sub_if_data *sdata) { struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *conf; unsigned int link_id; u32 mask = 0; for_each_vif_active_link(&sdata->vif, link_conf, link_id) { conf = sdata_dereference(link_conf->chanctx_conf, sdata); if (!conf || conf->radio_idx < 0) continue; mask |= BIT(conf->radio_idx); } return mask; } u32 ieee80211_get_radio_mask(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return __ieee80211_get_radio_mask(sdata); } static bool ieee80211_sdata_uses_radio(struct ieee80211_sub_if_data *sdata, int radio_idx) { if (radio_idx < 0) return true; return __ieee80211_get_radio_mask(sdata) & BIT(radio_idx); } static int ieee80211_fill_ifcomb_params(struct ieee80211_local *local, struct iface_combination_params *params, const struct cfg80211_chan_def *chandef, struct ieee80211_sub_if_data *sdata) { struct ieee80211_sub_if_data *sdata_iter; struct ieee80211_chanctx *ctx; int total = !!sdata; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; if (params->radio_idx >= 0 && ctx->conf.radio_idx != params->radio_idx) continue; params->radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); if (chandef && ctx->mode != IEEE80211_CHANCTX_EXCLUSIVE && cfg80211_chandef_compatible(chandef, &ctx->conf.def)) continue; params->num_different_channels++; } list_for_each_entry(sdata_iter, &local->interfaces, list) { struct wireless_dev *wdev_iter; wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || !ieee80211_sdata_running(sdata_iter) || cfg80211_iftype_allowed(local->hw.wiphy, wdev_iter->iftype, 0, 1)) continue; if (!ieee80211_sdata_uses_radio(sdata_iter, params->radio_idx)) continue; params->iftype_num[wdev_iter->iftype]++; total++; } return total; } int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect, int radio_idx) { bool shared = chanmode == IEEE80211_CHANCTX_SHARED; struct ieee80211_local *local = sdata->local; enum nl80211_iftype iftype = sdata->wdev.iftype; struct iface_combination_params params = { .radar_detect = radar_detect, .radio_idx = radio_idx, }; int total; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(hweight32(radar_detect) > 1)) return -EINVAL; if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED && !chandef->chan)) return -EINVAL; if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return -EINVAL; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { /* * always passing this is harmless, since it'll be the * same value that cfg80211 finds if it finds the same * interface ... and that's always allowed */ params.new_beacon_int = sdata->vif.bss_conf.beacon_int; } /* Always allow software iftypes */ if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1)) { if (radar_detect) return -EINVAL; return 0; } if (chandef) params.num_different_channels = 1; if (iftype != NL80211_IFTYPE_UNSPECIFIED) params.iftype_num[iftype] = 1; total = ieee80211_fill_ifcomb_params(local, &params, shared ?
chandef : NULL, sdata); if (total == 1 && !params.radar_detect) return 0; return cfg80211_check_combinations(local->hw.wiphy, &params); } static void ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c, void *data) { u32 *max_num_different_channels = data; *max_num_different_channels = max(*max_num_different_channels, c->num_different_channels); } int ieee80211_max_num_channels(struct ieee80211_local *local, int radio_idx) { u32 max_num_different_channels = 1; int err; struct iface_combination_params params = { .radio_idx = radio_idx, }; lockdep_assert_wiphy(local->hw.wiphy); ieee80211_fill_ifcomb_params(local, &params, NULL, NULL); err = cfg80211_iter_combinations(local->hw.wiphy, &params, ieee80211_iter_max_chans, &max_num_different_channels); if (err < 0) return err; return max_num_different_channels; } void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_s1g_cap s1g_capab; u8 *pos; int i; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; if (!caps->s1g) return; memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap)); memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs)); /* override the capability info */ for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) { u8 mask = ifmgd->s1g_capa_mask.capab_info[i]; s1g_capab.capab_info[i] &= ~mask; s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask; } /* then MCS and NSS set */ for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) { u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i]; s1g_capab.supp_mcs_nss[i] &= ~mask; s1g_capab.supp_mcs_nss[i] |= ifmgd->s1g_capa.supp_mcs_nss[i] & mask; } pos = skb_put(skb, 2 + sizeof(s1g_capab)); *pos++ = WLAN_EID_S1G_CAPABILITIES; *pos++ = sizeof(s1g_capab); memcpy(pos, &s1g_capab, sizeof(s1g_capab)); } void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { u8 *pos = skb_put(skb, 3); *pos++ = WLAN_EID_AID_REQUEST; *pos++ = 1; *pos++ = 0; } u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) { *buf++ = WLAN_EID_VENDOR_SPECIFIC; *buf++ = 7; /* len */ *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */ *buf++ = 0x50; *buf++ = 0xf2; *buf++ = 2; /* WME */ *buf++ = 0; /* WME info */ *buf++ = 1; /* WME ver */ *buf++ = qosinfo; /* U-APSD no in use */ return buf; } void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); u32 frag_cnt = 0, frag_bytes = 0; struct sk_buff *skb; skb_queue_walk(&txqi->frags, skb) { frag_cnt++; frag_bytes += skb->len; } if (frame_cnt) *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = { IEEE80211_WMM_IE_STA_QOSINFO_AC_VO, IEEE80211_WMM_IE_STA_QOSINFO_AC_VI, IEEE80211_WMM_IE_STA_QOSINFO_AC_BE, IEEE80211_WMM_IE_STA_QOSINFO_AC_BK }; u16 ieee80211_encode_usf(int listen_interval) { static const int listen_int_usf[] = { 1, 10, 1000, 10000 }; u16 ui, usf = 0; /* find greatest USF */ while (usf < IEEE80211_MAX_USF) { if (listen_interval % listen_int_usf[usf + 1]) break; usf += 1; } ui = listen_interval / listen_int_usf[usf]; /* error if there is a remainder. 
Should've been checked by user */ WARN_ON_ONCE(ui > IEEE80211_MAX_UI); listen_interval = FIELD_PREP(LISTEN_INT_USF, usf) | FIELD_PREP(LISTEN_INT_UI, ui); return (u16) listen_interval; } /* this may return more than ieee80211_put_eht_cap() will need */ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_supported_band *sband; bool is_ap; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!he_cap || !eht_cap) return 0; is_ap = sdata->vif.type == NL80211_IFTYPE_AP; n = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, is_ap); return 2 + 1 + sizeof(eht_cap->eht_cap_elem) + n + ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], eht_cap->eht_cap_elem.phy_cap_info); return 0; } int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); bool for_ap = sdata->vif.type == NL80211_IFTYPE_AP; struct ieee80211_eht_cap_elem_fixed fixed; struct ieee80211_he_cap_elem he; u8 mcs_nss_len, ppet_len; u8 orig_mcs_nss_len; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; /* Make sure we have place for the IE */ if (!he_cap || !eht_cap) return 0; orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, for_ap); ieee80211_get_adjusted_he_cap(conn, he_cap, &he); fixed = eht_cap->eht_cap_elem; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80) fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) { fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ; } if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ; mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap); ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], fixed.phy_cap_info); ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len; if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, ie_len - 2); skb_put_u8(skb, WLAN_EID_EXT_EHT_CAPABILITY); skb_put_data(skb, &fixed, sizeof(fixed)); if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) { /* * If the (non-AP) STA became 20 MHz only, then convert from * <=80 to 20-MHz-only format, where MCSes are indicated in * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9, * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9. 
*/ skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss); } else { skb_put_data(skb, &eht_cap->eht_mcs_nss_supp, mcs_nss_len); } if (ppet_len) skb_put_data(skb, &eht_cap->eht_ppe_thres, ppet_len); return 0; } const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) { static const char * const modes[] = { [IEEE80211_CONN_MODE_S1G] = "S1G", [IEEE80211_CONN_MODE_LEGACY] = "legacy", [IEEE80211_CONN_MODE_HT] = "HT", [IEEE80211_CONN_MODE_VHT] = "VHT", [IEEE80211_CONN_MODE_HE] = "HE", [IEEE80211_CONN_MODE_EHT] = "EHT", }; if (WARN_ON(mode >= ARRAY_SIZE(modes))) return "<out of range>"; return modes[mode] ?: "<missing string>"; } enum ieee80211_conn_bw_limit ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef) { switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_CONN_BW_LIMIT_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_CONN_BW_LIMIT_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_CONN_BW_LIMIT_80; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: return IEEE80211_CONN_BW_LIMIT_160; case NL80211_CHAN_WIDTH_320: return IEEE80211_CONN_BW_LIMIT_320; default: WARN(1, "unhandled chandef width %d\n", chandef->width); return IEEE80211_CONN_BW_LIMIT_20; } } void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) { for (int i = 0; i < 2; i++) { tpe->max_local[i].valid = false; memset(tpe->max_local[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_local[i].power)); tpe->max_reg_client[i].valid = false; memset(tpe->max_reg_client[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_reg_client[i].power)); tpe->psd_local[i].valid = false; memset(tpe->psd_local[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_local[i].power)); tpe->psd_reg_client[i].valid = false; memset(tpe->psd_reg_client[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_reg_client[i].power)); } } bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; } EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); |
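/*
 * Illustrative sketch only (not part of net/mac80211/util.c): decoding the
 * USF/UI listen-interval format produced by ieee80211_encode_usf() above.
 * It assumes FIELD_GET() from <linux/bitfield.h> and the LISTEN_INT_USF /
 * LISTEN_INT_UI masks from <linux/ieee80211.h>; example_decode_usf() is a
 * hypothetical helper, not a mac80211 API.
 */
static inline int example_decode_usf(u16 encoded)
{
	/* Same scale table as the encoder: USF selects a multiplier for UI. */
	static const int listen_int_usf[] = { 1, 10, 1000, 10000 };
	u16 usf = FIELD_GET(LISTEN_INT_USF, encoded);
	u16 ui = FIELD_GET(LISTEN_INT_UI, encoded);

	/* listen interval (in beacon intervals) = UI * scale[USF] */
	return ui * listen_int_usf[usf];
}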
// SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/dir.c * BFS directory operations. * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com> * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005 */ #include <linux/time.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include "bfs.h" #undef DEBUG #ifdef DEBUG #define dprintf(x...) printf(x) #else #define dprintf(x...)
#endif static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino); static struct buffer_head *bfs_find_entry(struct inode *dir, const struct qstr *child, struct bfs_dirent **res_dir); static int bfs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct buffer_head *bh; struct bfs_dirent *de; unsigned int offset; int block; if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); return -EINVAL; } while (ctx->pos < dir->i_size) { offset = ctx->pos & (BFS_BSIZE - 1); block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); bh = sb_bread(dir->i_sb, block); if (!bh) { ctx->pos += BFS_BSIZE - offset; continue; } do { de = (struct bfs_dirent *)(bh->b_data + offset); if (de->ino) { int size = strnlen(de->name, BFS_NAMELEN); if (!dir_emit(ctx, de->name, size, le16_to_cpu(de->ino), DT_UNKNOWN)) { brelse(bh); return 0; } } offset += BFS_DIRENT_SIZE; ctx->pos += BFS_DIRENT_SIZE; } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); brelse(bh); } return 0; } const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .iterate_shared = bfs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int err; struct inode *inode; struct super_block *s = dir->i_sb; struct bfs_sb_info *info = BFS_SB(s); unsigned long ino; inode = new_inode(s); if (!inode) return -ENOMEM; mutex_lock(&info->bfs_lock); ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); if (ino > info->si_lasti) { mutex_unlock(&info->bfs_lock); iput(inode); return -ENOSPC; } set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; inode->i_ino = ino; BFS_I(inode)->i_dsk_ino = ino; BFS_I(inode)->i_sblock = 0; BFS_I(inode)->i_eblock = 0; insert_inode_hash(inode); mark_inode_dirty(inode); bfs_dump_imap("create", s); err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino); if (err) { inode_dec_link_count(inode); mutex_unlock(&info->bfs_lock); iput(inode); return err; } mutex_unlock(&info->bfs_lock); d_instantiate(dentry, inode); return 0; } static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(dir->i_sb); if (dentry->d_name.len > BFS_NAMELEN) return ERR_PTR(-ENAMETOOLONG); mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, &dentry->d_name, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); } mutex_unlock(&info->bfs_lock); return d_splice_alias(inode, dentry); } static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { struct inode *inode = d_inode(old); struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; mutex_lock(&info->bfs_lock); err = bfs_add_entry(dir, &new->d_name, inode->i_ino); if (err) { mutex_unlock(&info->bfs_lock); return err; } inc_nlink(inode); inode_set_ctime_current(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); mutex_unlock(&info->bfs_lock); return 0; } static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; struct inode *inode = 
d_inode(dentry); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(inode->i_sb); mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, &dentry->d_name, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; if (!inode->i_nlink) { printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); error = 0; out_brelse: brelse(bh); mutex_unlock(&info->bfs_lock); return error; } static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; struct bfs_dirent *old_de, *new_de; struct bfs_sb_info *info; int error = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_bh = new_bh = NULL; old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) return -EINVAL; info = BFS_SB(old_inode->i_sb); mutex_lock(&info->bfs_lock); old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de); if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) goto end_rename; error = -EPERM; new_inode = d_inode(new_dentry); new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh && !new_inode) { brelse(new_bh); new_bh = NULL; } if (!new_bh) { error = bfs_add_entry(new_dir, &new_dentry->d_name, old_inode->i_ino); if (error) goto end_rename; } old_de->ino = 0; inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); if (new_inode) { inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); error = 0; end_rename: mutex_unlock(&info->bfs_lock); brelse(old_bh); brelse(new_bh); return error; } const struct inode_operations bfs_dir_inops = { .create = bfs_create, .lookup = bfs_lookup, .link = bfs_link, .unlink = bfs_unlink, .rename = bfs_rename, }; static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) { const unsigned char *name = child->name; int namelen = child->len; struct buffer_head *bh; struct bfs_dirent *de; int block, sblock, eblock, off, pos; int i; dprintf("name=%s, namelen=%d\n", name, namelen); sblock = BFS_I(dir)->i_sblock; eblock = BFS_I(dir)->i_eblock; for (block = sblock; block <= eblock; block++) { bh = sb_bread(dir->i_sb, block); if (!bh) return -EIO; for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { de = (struct bfs_dirent *)(bh->b_data + off); if (!de->ino) { pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; inode_set_ctime_current(dir); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) de->name[i] = (i < namelen) ? 
name[i] : 0; mark_buffer_dirty_inode(bh, dir); brelse(bh); return 0; } } brelse(bh); } return -ENOSPC; } static inline int bfs_namecmp(int len, const unsigned char *name, const char *buffer) { if ((len < BFS_NAMELEN) && buffer[len]) return 0; return !memcmp(name, buffer, len); } static struct buffer_head *bfs_find_entry(struct inode *dir, const struct qstr *child, struct bfs_dirent **res_dir) { unsigned long block = 0, offset = 0; struct buffer_head *bh = NULL; struct bfs_dirent *de; const unsigned char *name = child->name; int namelen = child->len; *res_dir = NULL; if (namelen > BFS_NAMELEN) return NULL; while (block * BFS_BSIZE + offset < dir->i_size) { if (!bh) { bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block); if (!bh) { block++; continue; } } de = (struct bfs_dirent *)(bh->b_data + offset); offset += BFS_DIRENT_SIZE; if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) { *res_dir = de; return bh; } if (offset < bh->b_size) continue; brelse(bh); bh = NULL; offset = 0; block++; } brelse(bh); return NULL; } |
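/*
 * Illustrative sketch only (not part of fs/bfs/dir.c): BFS directories are
 * arrays of fixed-size entries (BFS_DIRENT_SIZE bytes each), so a directory
 * position maps to a block and in-block offset with the same arithmetic that
 * bfs_readdir() uses above. example_dirent_pos() is a hypothetical helper,
 * not a BFS API.
 */
static inline void example_dirent_pos(struct inode *dir, loff_t pos,
				      int *block, unsigned int *offset)
{
	/* Offset of the entry within its BFS_BSIZE directory block. */
	*offset = pos & (BFS_BSIZE - 1);
	/* Absolute block number: first data block plus the block index. */
	*block = BFS_I(dir)->i_sblock + (pos >> BFS_BSIZE_BITS);
}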
// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/group.c - Operations for adding/removing multiple files at once.
* * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation */ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> #include <linux/fs.h> #include "sysfs.h" static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) { if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); if (grp->attrs && grp->attrs[0] && grp->is_visible_const) return grp->is_visible_const(kobj, grp->attrs[0], 0); if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); return 0; } static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or * visibility. Do this by first removing then * re-adding (if required) the file. */ if (update) kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible || grp->is_visible_const) { if (grp->is_visible) mode = grp->is_visible(kobj, *attr, i); else mode = grp->is_visible_const(kobj, *attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, gid, NULL); if (unlikely(error)) break; } if (error) { remove_files(parent, grp); goto exit; } } if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; size_t size = (*bin_attr)->size; if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } if (grp->bin_size) size = grp->bin_size(kobj, *bin_attr, i); WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, mode, size, uid, gid, NULL); if (error) break; } if (error) remove_files(parent, grp); } exit: return error; } static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; kuid_t uid; kgid_t gid; int error; if (WARN_ON(!kobj || (!update && !kobj->sd))) return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", kobj->name, grp->name ?: ""); return 0; } kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { umode_t mode = __first_visible(grp, 
kobj); if (mode & SYSFS_GROUP_INVISIBLE) mode = 0; else mode = S_IRWXU | S_IRUGO | S_IXUGO; if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { pr_debug("attr grp %s/%s not created yet\n", kobj->name, grp->name); /* may have been invisible prior to this update */ update = 0; } else if (!mode) { sysfs_remove_group(kobj, grp); kernfs_put(kn); return 0; } } if (!update) { if (!mode) return 0; kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn); } } } else { kn = kobj->sd; } kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); } kernfs_put(kn); if (grp->name && update) kernfs_put(kn); return error; } /** * sysfs_create_group - given a directory kobject, create an attribute group * @kobj: The kobject to create the group on * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 0, grp); } EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, const struct attribute_group **groups) { int error = 0; int i; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]); if (error) { while (--i >= 0) sysfs_remove_group(kobj, groups[i]); break; } } return error; } /** * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to create the group on * @groups: The attribute groups to create, NULL terminated * * This function creates a bunch of attribute groups. If an error occurs when * creating a group, all previously created groups will be removed, unwinding * everything back to the original state when this function was called. * It will explicitly warn and error if any of the attribute files being * created already exist. * * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 0, groups); } EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to update the group on * @groups: The attribute groups to update, NULL terminated * * This function update a bunch of attribute groups. If an error occurs when * updating a group, all previously updated groups will be removed together * with already existing (not updated) attributes. * * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 1, groups); } EXPORT_SYMBOL_GPL(sysfs_update_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group * @kobj: The kobject to update the group on * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any * of the attribute files being created already exist. 
Furthermore, * if the visibility of the files has changed through the is_visible() * callback, it will update the permissions and add or remove the * relevant files. Changing a group's name (subdirectory name under * kobj's directory in sysfs) is not allowed. * * The primary use for this function is to call it after making a change * that affects group visibility. * * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 1, grp); } EXPORT_SYMBOL_GPL(sysfs_update_group); /** * sysfs_remove_group: remove a group from a kobject * @kobj: kobject to remove the group from * @grp: group to remove * * This function removes a group of attributes from a kobject. The attributes * previously have to have been created for this group, otherwise it will fail. */ void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { pr_debug("sysfs group '%s' not found for kobject '%s'\n", grp->name, kobject_name(kobj)); return; } } else { kn = parent; kernfs_get(kn); } remove_files(kn, grp); if (grp->name) kernfs_remove(kn); kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); /** * sysfs_remove_groups - remove a list of groups * * @kobj: The kobject for the groups to be removed from * @groups: NULL terminated list of groups to be removed * * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { int i; if (!groups) return; for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * * This function returns an error if the group doesn't exist, the .name field is * NULL or any of the files already exist in that group, in which case none of * the new files are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error = 0; struct attribute *const *attr; int i; parent = kernfs_find_and_get(kobj->sd, grp->name); if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); } kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_merge_group); /** * sysfs_unmerge_group - remove files from a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; struct attribute *const *attr; parent = kernfs_find_and_get(kobj->sd, grp->name); if (parent) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); /** * sysfs_add_link_to_group - add a symlink to an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. 
* @target: The target kobject of the symlink to create. * @link_name: The name of the symlink to create. */ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { struct kernfs_node *parent; int error = 0; parent = kernfs_find_and_get(kobj->sd, group_name); if (!parent) return -ENOENT; error = sysfs_create_link_sd(parent, target, link_name); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); /** * sysfs_remove_link_from_group - remove a symlink from an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @link_name: The name of the symlink to remove. */ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { struct kernfs_node *parent; parent = kernfs_find_and_get(kobj->sd, group_name); if (parent) { kernfs_remove_by_name(parent, link_name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. * @symlink_name: The name of the symlink file (target_name will be * considered if symlink_name is NULL). */ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; struct kernfs_node *link; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() * for details. */ spin_lock(&sysfs_symlink_target_lock); target = target_kobj->sd; if (target) kernfs_get(target); spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; } if (!symlink_name) symlink_name = target_name; link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); static int sysfs_group_attrs_change_owner(struct kobject *kobj, struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; int error, i; umode_t mode; if (grp->attrs) { struct attribute *const *attr; for (i = 0, attr = grp->attrs; *attr; i++, attr++) { if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (mode & SYSFS_GROUP_INVISIBLE) break; if (!mode) continue; } kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } if (grp->bin_attrs) { const struct bin_attribute *const *bin_attr; for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); if (mode & SYSFS_GROUP_INVISIBLE) break; if (!mode) continue; } kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } return 0; } /** * sysfs_group_change_owner - change owner of an attribute group. * @kobj: The kobject containing the group. * @grp: The attribute group. 
* @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *grp, kuid_t kuid, kgid_t kgid) { struct kernfs_node *grp_kn; int error; struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; if (!kobj->state_in_sysfs) return -EINVAL; if (grp->name) { grp_kn = kernfs_find_and_get(kobj->sd, grp->name); } else { kernfs_get(kobj->sd); grp_kn = kobj->sd; } if (!grp_kn) return -ENOENT; error = kernfs_setattr(grp_kn, &newattrs); if (!error) error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp, &newattrs); kernfs_put(grp_kn); return error; } EXPORT_SYMBOL_GPL(sysfs_group_change_owner); /** * sysfs_groups_change_owner - change owner of a set of attribute groups. * @kobj: The kobject containing the groups. * @groups: The attribute groups. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; if (!kobj->state_in_sysfs) return -EINVAL; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); if (error) break; } return error; } EXPORT_SYMBOL_GPL(sysfs_groups_change_owner); |
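/*
 * Illustrative usage sketch only (not part of fs/sysfs/group.c): a typical
 * caller declares a named attribute group and creates/removes it with the
 * helpers above. Assumes <linux/kobject.h> and <linux/sysfs.h>; the names
 * example_show, example_attr, demo_attrs, demo_group, demo_register() and
 * demo_unregister() are hypothetical.
 */
static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	return sysfs_emit(buf, "%d\n", 42);
}

static struct kobj_attribute example_attr = __ATTR_RO(example);

static struct attribute *demo_attrs[] = {
	&example_attr.attr,
	NULL,
};

static const struct attribute_group demo_group = {
	.name	= "demo",	/* creates a "demo/" subdirectory under the kobject */
	.attrs	= demo_attrs,
};

/* Typically called from an init/probe path with a live, registered kobject. */
static int demo_register(struct kobject *kobj)
{
	return sysfs_create_group(kobj, &demo_group);
}

static void demo_unregister(struct kobject *kobj)
{
	sysfs_remove_group(kobj, &demo_group);
}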
/* SPDX-License-Identifier: GPL-2.0+ */ /* * Persistent object (dat entry/disk inode) allocator/deallocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Originally written by Koji Sato. * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #ifndef _NILFS_ALLOC_H #define _NILFS_ALLOC_H #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> /** * nilfs_palloc_entries_per_group - get the number of entries per group * @inode: inode of metadata file using this allocator * * The number of entries per group is defined by the number of bits * that a bitmap block can maintain. * * Return: Number of entries per group. */ static inline unsigned long nilfs_palloc_entries_per_group(const struct inode *inode) { return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); } int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, const struct buffer_head *bh); int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); /** * struct nilfs_palloc_req - persistent allocator request and reply * @pr_entry_nr: entry number (vblocknr or inode number) * @pr_desc_bh: buffer head of the buffer containing block group descriptors * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap * @pr_entry_bh: buffer head of the buffer containing translation entries */ struct nilfs_palloc_req { __u64 pr_entry_nr; struct buffer_head *pr_desc_bh; struct buffer_head *pr_bitmap_bh; struct buffer_head *pr_entry_bh; }; int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap); void nilfs_palloc_commit_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_set_bit_atomic ext2_set_bit_atomic #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit find_next_zero_bit_le #define nilfs_find_next_bit find_next_bit_le /** * struct nilfs_bh_assoc - block offset and buffer head association * @blkoff: block offset * @bh: buffer head */ struct nilfs_bh_assoc { unsigned long blkoff; struct buffer_head *bh; }; /** * struct nilfs_palloc_cache - persistent object allocator cache * @lock: cache protecting lock * @prev_desc: blockgroup descriptors cache * @prev_bitmap: blockgroup bitmap cache * @prev_entry: translation entries cache */ struct nilfs_palloc_cache { spinlock_t lock; struct nilfs_bh_assoc prev_desc; struct nilfs_bh_assoc prev_bitmap; struct nilfs_bh_assoc prev_entry; }; void nilfs_palloc_setup_cache(struct inode *inode, struct nilfs_palloc_cache *cache); void nilfs_palloc_clear_cache(struct inode *inode); void nilfs_palloc_destroy_cache(struct inode *inode); #endif /* _NILFS_ALLOC_H */
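/*
 * Illustrative sketch only (not part of nilfs2): with 4 KiB metadata blocks
 * (i_blkbits == 12), nilfs_palloc_entries_per_group() above yields
 * 1UL << (12 + 3) == 32768 entries per group, i.e. one bit per entry in a
 * bitmap block. A hypothetical helper splitting an entry number into its
 * group and in-group offset under that rule (assumes do_div() from
 * <asm/div64.h>):
 */
static inline unsigned long example_palloc_group(const struct inode *inode,
						 __u64 nr,
						 unsigned long *offset)
{
	__u64 group = nr;

	/* do_div() divides in place and returns the remainder. */
	*offset = do_div(group, nilfs_palloc_entries_per_group(inode));
	return group;
}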
// SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ /* Implementation notes: * * - There are two kinds of sockets: those created by user action (such as * calling socket(2)) and those created by incoming connection request packets. * * - There are two "global" tables, one for bound sockets (sockets that have * specified an address that they are responsible for) and one for connected * sockets (sockets that have established a connection with another socket). * These tables are "global" in that all sockets on the system are placed * within them. - Note, though, that the bound table contains an extra entry * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in * that list. The bound table is used solely for lookup of sockets when packets * are received and that's not necessary for SOCK_DGRAM sockets since we create * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM * sockets out of the bound hash buckets will reduce the chance of collisions * when looking for SOCK_STREAM sockets and prevents us from having to check the * socket type in the hash table lookups.
* * - Sockets created by user action will either be "client" sockets that * initiate a connection or "server" sockets that listen for connections; we do * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. * When future packets are received for the address the listener socket is * bound to, we check if the source of the packet is from one that has an * existing pending connection. If it does, we process the packet for the * pending socket. When that socket reaches the connected state, it is removed * from the listener socket's pending list and enqueued in the listener * socket's accept queue. Callers of accept(2) will accept connected sockets * from the listener socket's accept queue. If the socket cannot be accepted * for some reason then it is marked rejected. Once the connection is * accepted, it is owned by the user process and the responsibility for cleanup * falls with that user process. * * - It is possible that these pending sockets will never reach the connected * state; in fact, we may never receive another packet after the connection * request. Because of this, we must schedule a cleanup function to run in the * future, after some amount of time passes where a connection should have been * established. This function ensures that the socket is off all lists so it * cannot be retrieved, then drops all references to the socket so it is cleaned * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this * function will also cleanup rejected sockets, those that reach the connected * state but leave it before they have been accepted. * * - Lock ordering for pending or accept queue sockets is: * * lock_sock(listener); * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); * * Using explicit nested locking keeps lockdep happy since normally only one * lock of a given class may be taken at a time. * * - Sockets created by user action will be cleaned up when the user process * calls close(2), causing our release implementation to be called. Our release * implementation will perform some cleanup then drop the last reference so our * sk_destruct implementation is invoked. Our sk_destruct implementation will * perform additional cleanup that's common for both types of sockets. * * - A socket's reference count is what ensures that the structure won't be * freed. Each entry in a list (such as the "global" bound and connected tables * and the listener socket's pending list and connected queue) ensures a * reference. When we defer work until process context and pass a socket as our * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. 
* * - sk->sk_state uses the TCP state constants because they are widely used by * other address families and exposed to userspace tools like ss(8): * * TCP_CLOSE - unconnected * TCP_SYN_SENT - connecting * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening */ #include <linux/compat.h> #include <linux/types.h> #include <linux/bitops.h> #include <linux/cred.h> #include <linux/errqueue.h> #include <linux/init.h> #include <linux/io.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> #include <linux/poll.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/af_vsock.h> #include <uapi/linux/vm_sockets.h> #include <uapi/asm-generic/ioctls.h> static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); static void vsock_close(struct sock *sk, long timeout); /* Protocol family. */ struct proto vsock_proto = { .name = "AF_VSOCK", .owner = THIS_MODULE, .obj_size = sizeof(struct vsock_sock), .close = vsock_close, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = vsock_bpf_update_proto, #endif }; /* The default peer timeout indicates how long we will wait for a peer response * to a control message. */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) #define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 /* Transport used for host->guest communication */ static const struct vsock_transport *transport_h2g; /* Transport used for guest->host communication */ static const struct vsock_transport *transport_g2h; /* Transport used for DGRAM communication */ static const struct vsock_transport *transport_dgram; /* Transport used for local communication */ static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected * VSocket is stored in the connected hash table. * * Unbound sockets are all put on the same list attached to the end of the hash * table (vsock_unbound_sockets). Bound sockets are added to the hash table in * the bucket that their local address hashes to (vsock_bound_sockets(addr) * represents the list that addr hashes to). * * Specifically, we initialize the vsock_bind_table array to a size of * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) /* XXX This can probably be implemented in a better way. 
*/ #define VSOCK_CONN_HASH(src, dst) \ (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) #define vsock_connected_sockets(src, dst) \ (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; EXPORT_SYMBOL_GPL(vsock_bind_table); struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; EXPORT_SYMBOL_GPL(vsock_connected_table); DEFINE_SPINLOCK(vsock_table_lock); EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); struct sockaddr_vm local_addr; if (vsock_addr_bound(&vsk->local_addr)) return 0; vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); return __vsock_bind(sk, &local_addr); } static void vsock_init_tables(void) { int i; for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) INIT_LIST_HEAD(&vsock_bind_table[i]); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); } static void __vsock_insert_bound(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->bound_table, list); } static void __vsock_insert_connected(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->connected_table, list); } static void __vsock_remove_bound(struct vsock_sock *vsk) { list_del_init(&vsk->bound_table); sock_put(&vsk->sk); } static void __vsock_remove_connected(struct vsock_sock *vsk) { list_del_init(&vsk->connected_table); sock_put(&vsk->sk); } static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { if (vsock_addr_equals_addr(addr, &vsk->local_addr)) return sk_vsock(vsk); if (addr->svm_port == vsk->local_addr.svm_port && (vsk->local_addr.svm_cid == VMADDR_CID_ANY || addr->svm_cid == VMADDR_CID_ANY)) return sk_vsock(vsk); } return NULL; } static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_connected_sockets(src, dst), connected_table) { if (vsock_addr_equals_addr(src, &vsk->remote_addr) && dst->svm_port == vsk->local_addr.svm_port) { return sk_vsock(vsk); } } return NULL; } static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); __vsock_insert_bound(vsock_unbound_sockets, vsk); spin_unlock_bh(&vsock_table_lock); } void vsock_insert_connected(struct vsock_sock *vsk) { struct list_head *list = vsock_connected_sockets( &vsk->remote_addr, &vsk->local_addr); spin_lock_bh(&vsock_table_lock); __vsock_insert_connected(list, vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_bound_table(vsk)) __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_connected_table(vsk)) __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_bound_socket(addr); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return 
sk; } EXPORT_SYMBOL_GPL(vsock_find_bound_socket); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_connected_socket(src, dst); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { /* Transport reassignment must not remove the binding. */ if (sock_flag(sk_vsock(vsk), SOCK_DEAD)) vsock_remove_bound(vsk); vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)) { int i; spin_lock_bh(&vsock_table_lock); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], connected_table) { if (vsk->transport != transport) continue; fn(sk_vsock(vsk)); } } spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); void vsock_add_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vlistener; struct vsock_sock *vpending; vlistener = vsock_sk(listener); vpending = vsock_sk(pending); sock_hold(pending); sock_hold(listener); list_add_tail(&vpending->pending_links, &vlistener->pending_links); } EXPORT_SYMBOL_GPL(vsock_add_pending); void vsock_remove_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vpending = vsock_sk(pending); list_del_init(&vpending->pending_links); sock_put(listener); sock_put(pending); } EXPORT_SYMBOL_GPL(vsock_remove_pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); vconnected = vsock_sk(connected); sock_hold(connected); sock_hold(listener); list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); static bool vsock_use_local_transport(unsigned int remote_cid) { lockdep_assert_held(&vsock_register_mutex); if (!transport_local) return false; if (remote_cid == VMADDR_CID_LOCAL) return true; if (transport_g2h) { return remote_cid == transport_g2h->get_local_cid(); } else { return remote_cid == VMADDR_CID_HOST; } } static void vsock_deassign_transport(struct vsock_sock *vsk) { if (!vsk->transport) return; vsk->transport->destruct(vsk); module_put(vsk->transport->module); vsk->transport = NULL; } /* Assign a transport to a socket and call the .init transport callback. * * Note: for connection oriented socket this must be called when vsk->remote_addr * is set (e.g. during the connect() or when a connection request on a listener * socket is received). 
* The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; * - remote CID > VMADDR_CID_HOST will use host->guest transport; */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { const struct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); unsigned int remote_cid = vsk->remote_addr.svm_cid; __u8 remote_flags; int ret; /* If the packet is coming with the source and destination CIDs higher * than VMADDR_CID_HOST, then a vsock channel where all the packets are * forwarded to the host should be established. Then the host will * need to forward the packets to the guest. * * The flag is set on the (listen) receive path (psk is not NULL). On * the connect path the flag can be set by the user space application. */ if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST && vsk->remote_addr.svm_cid > VMADDR_CID_HOST) vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; remote_flags = vsk->remote_addr.svm_flags; mutex_lock(&vsock_register_mutex); switch (sk->sk_type) { case SOCK_DGRAM: new_transport = transport_dgram; break; case SOCK_STREAM: case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; else new_transport = transport_h2g; break; default: ret = -ESOCKTNOSUPPORT; goto err; } if (vsk->transport && vsk->transport == new_transport) { ret = 0; goto err; } /* We increase the module refcnt to prevent the transport unloading * while there are open sockets assigned to it. */ if (!new_transport || !try_module_get(new_transport->module)) { ret = -ENODEV; goto err; } /* It's safe to release the mutex after a successful try_module_get(). * Whichever transport `new_transport` points at, it won't go away until * the last module_put() below or in vsock_deassign_transport(). */ mutex_unlock(&vsock_register_mutex); if (vsk->transport) { /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this * function is called on a new socket which is not assigned to * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); /* transport's release() and destruct() can touch some socket * state, since we are reassigning the socket to a new transport * during vsock_connect(), let's reset these fields to have a * clean state. */ sock_reset_flag(sk, SOCK_DONE); sk->sk_state = TCP_CLOSE; vsk->peer_shutdown = 0; } if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || !new_transport->seqpacket_allow(remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } } ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); return ret; } vsk->transport = new_transport; return 0; err: mutex_unlock(&vsock_register_mutex); return ret; } EXPORT_SYMBOL_GPL(vsock_assign_transport); /* * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks. * Otherwise we may race with module removal. Do not use on `vsk->transport`. 
*/ static u32 vsock_registered_transport_cid(const struct vsock_transport **transport) { u32 cid = VMADDR_CID_ANY; mutex_lock(&vsock_register_mutex); if (*transport) cid = (*transport)->get_local_cid(); mutex_unlock(&vsock_register_mutex); return cid; } bool vsock_find_cid(unsigned int cid) { if (cid == vsock_registered_transport_cid(&transport_g2h)) return true; if (transport_h2g && cid == VMADDR_CID_HOST) return true; if (transport_local && cid == VMADDR_CID_LOCAL) return true; return false; } EXPORT_SYMBOL_GPL(vsock_find_cid); static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); if (list_empty(&vlistener->accept_queue)) return NULL; vconnected = list_entry(vlistener->accept_queue.next, struct vsock_sock, accept_queue); list_del_init(&vconnected->accept_queue); sock_put(listener); /* The caller will need a reference on the connected socket so we let * it call sock_put(). */ return sk_vsock(vconnected); } static bool vsock_is_accept_queue_empty(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return list_empty(&vsk->accept_queue); } static bool vsock_is_pending(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return !list_empty(&vsk->pending_links); } static int vsock_send_shutdown(struct sock *sk, int mode) { struct vsock_sock *vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) { struct sock *sk; struct sock *listener; struct vsock_sock *vsk; bool cleanup; vsk = container_of(work, struct vsock_sock, pending_work.work); sk = sk_vsock(vsk); listener = vsk->listener; cleanup = true; lock_sock(listener); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We * just need to drop our references to the sockets and be on * our way. */ cleanup = false; goto out; } /* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count. */ vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; out: release_sock(sk); release_sock(listener); if (cleanup) sock_put(sk); sock_put(sk); sock_put(listener); } /**** SOCKET OPERATIONS ****/ static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm *addr) { static u32 port; struct sockaddr_vm new_addr; if (!port) port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); if (addr->svm_port == VMADDR_PORT_ANY) { bool found = false; unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { if (port == VMADDR_PORT_ANY || port <= LAST_RESERVED_PORT) port = LAST_RESERVED_PORT + 1; new_addr.svm_port = port++; if (!__vsock_find_bound_socket(&new_addr)) { found = true; break; } } if (!found) return -EADDRNOTAVAIL; } else { /* If port is in reserved range, ensure caller * has necessary privileges. */ if (addr->svm_port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE)) { return -EACCES; } if (__vsock_find_bound_socket(&new_addr)) return -EADDRINUSE; } vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); /* Remove connection oriented sockets from the unbound list and add them * to the hash table for easy lookup by its address. 
The unbound list * is simply an extra entry at the end of the hash table, a trick used * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); return 0; } static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); int retval; /* First ensure this socket isn't already bound. */ if (vsock_addr_bound(&vsk->local_addr)) return -EINVAL; /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most * cases), we only allow binding to a local CID. */ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { case SOCK_STREAM: case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; case SOCK_DGRAM: retval = __vsock_bind_dgram(vsk, addr); break; default: retval = -EINVAL; break; } return retval; } static void vsock_connect_timeout(struct work_struct *work); static struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, unsigned short type, int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; sock_init_data(sock, sk); /* sk->sk_type is normally set in sock_init_data, but only if sock is * non-NULL. We make sure that our sockets always have a type by * setting it here if needed. */ if (!sock) sk->sk_type = type; vsk = vsock_sk(sk); vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); INIT_LIST_HEAD(&vsk->connected_table); vsk->listener = NULL; INIT_LIST_HEAD(&vsk->pending_links); INIT_LIST_HEAD(&vsk->accept_queue); vsk->rejected = false; vsk->sent_request = false; vsk->ignore_connecting_rst = false; vsk->peer_shutdown = 0; INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); psk = parent ? vsock_sk(parent) : NULL; if (parent) { vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; vsk->buffer_size = psk->buffer_size; vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; security_sk_clone(parent, sk); } else { vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } return sk; } static bool sock_type_connectible(u16 type) { return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); } static void __vsock_release(struct sock *sk, int level) { struct vsock_sock *vsk; struct sock *pending; vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". 
When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk). */ lock_sock_nested(sk, level); /* Indicate to vsock_remove_sock() that the socket is being released and * can be removed from the bound_table. Unlike transport reassignment * case, where the socket must remain bound despite vsock_remove_sock() * being called from the transport release() callback. */ sock_set_flag(sk, SOCK_DEAD); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } release_sock(sk); sock_put(sk); } static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); /* Flush MSG_ZEROCOPY leftovers. */ __skb_queue_purge(&sk->sk_error_queue); vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. */ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); put_cred(vsk->owner); } static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; err = sock_queue_rcv_skb(sk, skb); if (err) kfree_skb(skb); return err; } struct sock *vsock_create_connected(struct sock *parent) { return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, parent->sk_type, 0); } EXPORT_SYMBOL_GPL(vsock_create_connected); s64 vsock_stream_has_data(struct vsock_sock *vsk) { if (WARN_ON(!vsk->transport)) return 0; return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); if (WARN_ON(!vsk->transport)) return 0; if (sk->sk_type == SOCK_SEQPACKET) return vsk->transport->seqpacket_has_data(vsk); else return vsock_stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_connectible_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { if (WARN_ON(!vsk->transport)) return 0; return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); void vsock_data_ready(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } EXPORT_SYMBOL_GPL(vsock_data_ready); /* Dummy callback required by sockmap. * See unconditional call of saved_close() in sock_map_close(). 
*/ static void vsock_close(struct sock *sk, long timeout) { } static int vsock_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); __vsock_release(sk, 0); sock->sk = NULL; sock->state = SS_FREE; return 0; } static int vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { int err; struct sock *sk; struct sockaddr_vm *vm_addr; sk = sock->sk; if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) return -EINVAL; lock_sock(sk); err = __vsock_bind(sk, vm_addr); release_sock(sk); return err; } static int vsock_getname(struct socket *sock, struct sockaddr *addr, int peer) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *vm_addr; sk = sock->sk; vsk = vsock_sk(sk); err = 0; lock_sock(sk); if (peer) { if (sock->state != SS_CONNECTED) { err = -ENOTCONN; goto out; } vm_addr = &vsk->remote_addr; } else { vm_addr = &vsk->local_addr; } BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage)); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); out: release_sock(sk); return err; } void vsock_linger(struct sock *sk) { DEFINE_WAIT_FUNC(wait, woken_wake_function); ssize_t (*unsent)(struct vsock_sock *vsk); struct vsock_sock *vsk = vsock_sk(sk); long timeout; if (!sock_flag(sk, SOCK_LINGER)) return; timeout = sk->sk_lingertime; if (!timeout) return; /* Transports must implement `unsent_bytes` if they want to support * SOCK_LINGER through `vsock_linger()` since we use it to check when * the socket can be closed. */ unsent = vsk->transport->unsent_bytes; if (!unsent) return; add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) break; } while (!signal_pending(current) && timeout); remove_wait_queue(sk_sleep(sk), &wait); } EXPORT_SYMBOL_GPL(vsock_linger); static int vsock_shutdown(struct socket *sock, int mode) { int err; struct sock *sk; /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode * here like the other address families do. Note also that the * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), * which is what we want. */ mode++; if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; /* If this is a connection oriented socket and it is not connected then * bail out immediately. If it is a DGRAM socket then we must first * kick the socket so that it wakes up from any sleeping calls, for * example recv(), and then afterwards return the error. */ sk = sock->sk; lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; err = 0; } /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { sk->sk_shutdown |= mode; sk->sk_state_change(sk); if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } } out: release_sock(sk); return err; } static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk; __poll_t mask; struct vsock_sock *vsk; sk = sock->sk; vsk = vsock_sk(sk); poll_wait(file, sk_sleep(sk), wait); mask = 0; if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) /* Signify that there has been an error on this socket. */ mask |= EPOLLERR; /* INET sockets treat local write shutdown and peer write shutdown as a * case of EPOLLHUP set. 
*/ if ((sk->sk_shutdown == SHUTDOWN_MASK) || ((sk->sk_shutdown & SEND_SHUTDOWN) && (vsk->peer_shutdown & SEND_SHUTDOWN))) { mask |= EPOLLHUP; } if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLRDHUP; } if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; if (sock->type == SOCK_DGRAM) { /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending. */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); transport = vsk->transport; /* Listening sockets that have connections in their accept * queue can be read. */ if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int target = sock_rcvlowat(sk, 0, INT_MAX); int ret = transport->notify_poll_in( vsk, target, &data_ready_now); if (ret < 0) { mask |= EPOLLERR; } else { if (data_ready_now) mask |= EPOLLIN | EPOLLRDNORM; } } /* Sockets whose connections have been closed, reset, or * terminated should also be considered read, and we check the * shutdown flag for that. */ if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLIN | EPOLLRDNORM; } /* Connected sockets that can produce data can be written. */ if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( vsk, 1, &space_avail_now); if (ret < 0) { mask |= EPOLLERR; } else { if (space_avail_now) /* Remove EPOLLWRBAND since INET * sockets are not setting it. */ mask |= EPOLLOUT | EPOLLWRNORM; } } } /* Simulate INET socket poll behaviors, which sets * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM; } release_sock(sk); } return mask; } static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); if (WARN_ON_ONCE(!vsk->transport)) return -ENODEV; return vsk->transport->read_skb(vsk, read_actor); } static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* For now, MSG_DONTWAIT is always assumed... */ err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; err = vsock_auto_bind(vsk); if (err) goto out; /* If the provided message contains an address, use that. Otherwise * fall back on the socket's remote handle (if it has been connected). */ if (msg->msg_name && vsock_addr_cast(msg->msg_name, msg->msg_namelen, &remote_addr) == 0) { /* Ensure this address is of the right type and is a valid * destination. 
*/ if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); if (!vsock_addr_bound(remote_addr)) { err = -EINVAL; goto out; } } else if (sock->state == SS_CONNECTED) { remote_addr = &vsk->remote_addr; if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); /* XXX Should connect() or this function ensure remote_addr is * bound? */ if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EINVAL; goto out; } } else { err = -EINVAL; goto out; } if (!transport->dgram_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } err = transport->dgram_enqueue(vsk, remote_addr, msg, len); out: release_sock(sk); return err; } static int vsock_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; sk = sock->sk; vsk = vsock_sk(sk); err = vsock_addr_cast(addr, addr_len, &remote_addr); if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { lock_sock(sk); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sock->state = SS_UNCONNECTED; release_sock(sk); return 0; } else if (err != 0) return -EINVAL; lock_sock(sk); err = vsock_auto_bind(vsk); if (err) goto out; if (!vsk->transport->dgram_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); sock->state = SS_CONNECTED; /* sock map disallows redirection of non-TCP sockets with sk_state != * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams. * * This doesn't seem to be abnormal state for datagram sockets, as the * same approach can be see in other datagram socket types as well * (such as unix sockets). 
*/ sk->sk_state = TCP_ESTABLISHED; out: release_sock(sk); return err; } int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot; prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif return __vsock_dgram_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, int __user *arg) { struct sock *sk = sock->sk; struct vsock_sock *vsk; int ret; vsk = vsock_sk(sk); switch (cmd) { case SIOCINQ: { ssize_t n_bytes; if (!vsk->transport) { ret = -EOPNOTSUPP; break; } if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } n_bytes = vsock_stream_has_data(vsk); if (n_bytes < 0) { ret = n_bytes; break; } ret = put_user(n_bytes, arg); break; } case SIOCOUTQ: { ssize_t n_bytes; if (!vsk->transport || !vsk->transport->unsent_bytes) { ret = -EOPNOTSUPP; break; } if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } n_bytes = vsk->transport->unsent_bytes(vsk); if (n_bytes < 0) { ret = n_bytes; break; } ret = put_user(n_bytes, arg); break; } default: ret = -ENOIOCTLCMD; } return ret; } static int vsock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int ret; lock_sock(sock->sk); ret = vsock_do_ioctl(sock, cmd, (int __user *)arg); release_sock(sock->sk); return ret; } static const struct proto_ops vsock_dgram_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); } static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); lock_sock(sk); if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; sk->sk_socket->state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk_error_report(sk); vsock_transport_cancel_pkt(vsk); } release_sock(sk); sock_put(sk); } static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ switch (sock->state) { case SS_CONNECTED: err = -EISCONN; goto out; case SS_DISCONNECTING: err = -EINVAL; goto out; case SS_CONNECTING: /* This continues on so we can move sock into the SS_CONNECTED * state once the connection has completed (at which point err * will be set to zero also). 
Otherwise, we will either wait * for the connection or return -EALREADY should this be a * non-blocking call. */ err = -EALREADY; if (flags & O_NONBLOCK) goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; } /* Set the remote address that we are connecting to. */ memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); err = vsock_assign_transport(vsk, NULL); if (err) goto out; transport = vsk->transport; /* The hypervisor and well-known contexts do not have socket * endpoints. */ if (!transport || !transport->stream_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } if (vsock_msgzerocopy_allow(transport)) { set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); } else if (sock_flag(sk, SOCK_ZEROCOPY)) { /* If this option was set before 'connect()', * when transport was unknown, check that this * feature is supported here. */ err = -EOPNOTSUPP; goto out; } err = vsock_auto_bind(vsk); if (err) goto out; sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) goto out; /* sk_err might have been set as a result of an earlier * (failed) connect attempt. */ sk->sk_err = 0; /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ sock->state = SS_CONNECTING; err = -EINPROGRESS; } /* The receive path will handle all communication until we are able to * enter the connected state. Here we wait for the connection to be * completed or a notification of an error. */ timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* If the socket is already closing or it is in an error state, there * is no point in waiting. */ while (sk->sk_state != TCP_ESTABLISHED && sk->sk_state != TCP_CLOSING && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection * attempt, in case the peer doesn't respond in a * timely manner. We hold on to the socket until the * timeout fires. */ sock_hold(sk); /* If the timeout function is already scheduled, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ if (mod_delayed_work(system_percpu_wq, &vsk->connect_work, timeout)) sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); /* Connection established. Whatever happens to socket once we * release it, that's not connect()'s concern. No need to go * into signal and timeout handling. Call it a day. * * Note that allowing to "reset" an already established socket * here is racy and insecure. */ if (sk->sk_state == TCP_ESTABLISHED) break; /* If connection was _not_ established and a signal/timeout came * to be, we want the socket's state reset. User space may want * to retry. * * sk_state != TCP_ESTABLISHED implies that socket is not on * vsock_connected_table. We keep the binding and the transport * assigned. */ if (signal_pending(current) || timeout == 0) { err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout); /* Listener might have already responded with * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our * sk_state == TCP_SYN_SENT, which hereby we break. * In such case VIRTIO_VSOCK_OP_RST will follow. */ sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by * transport->connect(). 
*/ vsock_transport_cancel_pkt(vsk); goto out_wait; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } if (sk->sk_err) { err = -sk->sk_err; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; } out_wait: finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; } static int vsock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *listener; int err; struct sock *connected; struct vsock_sock *vconnected; long timeout; DEFINE_WAIT(wait); err = 0; listener = sock->sk; lock_sock(listener); if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && listener->sk_err == 0) { release_sock(listener); timeout = schedule_timeout(timeout); finish_wait(sk_sleep(listener), &wait); lock_sock(listener); if (signal_pending(current)) { err = sock_intr_errno(timeout); goto out; } else if (timeout == 0) { err = -EAGAIN; goto out; } prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(listener), &wait); if (listener->sk_err) err = -listener->sk_err; if (connected) { sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); /* If the listener socket has received an error, then we should * reject this socket and return. Note that we simply mark the * socket rejected, drop our reference, and let the cleanup * function handle the cleanup; the fact that we found it in * the listener's accept queue guarantees that the cleanup * function hasn't run yet. 
*/ if (err) { vconnected->rejected = true; } else { newsock->state = SS_CONNECTED; sock_graft(connected, newsock); if (vsock_msgzerocopy_allow(vconnected->transport)) set_bit(SOCK_SUPPORT_ZC, &connected->sk_socket->flags); } release_sock(connected); sock_put(connected); } out: release_sock(listener); return err; } static int vsock_listen(struct socket *sock, int backlog) { int err; struct sock *sk; struct vsock_sock *vsk; sk = sock->sk; lock_sock(sk); if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } if (sock->state != SS_UNCONNECTED) { err = -EINVAL; goto out; } vsk = vsock_sk(sk); if (!vsock_addr_bound(&vsk->local_addr)) { err = -EINVAL; goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; err = 0; out: release_sock(sk); return err; } static void vsock_update_buffer_size(struct vsock_sock *vsk, const struct vsock_transport *transport, u64 val) { if (val > vsk->buffer_max_size) val = vsk->buffer_max_size; if (val < vsk->buffer_min_size) val = vsk->buffer_min_size; if (val != vsk->buffer_size && transport && transport->notify_buffer_size) transport->notify_buffer_size(vsk, &val); vsk->buffer_size = val; } static int vsock_connectible_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK && level != SOL_SOCKET) return -ENOPROTOOPT; #define COPY_IN(_v) \ do { \ if (optlen < sizeof(_v)) { \ err = -EINVAL; \ goto exit; \ } \ if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ } while (0) err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; if (level == SOL_SOCKET) { int zerocopy; if (optname != SO_ZEROCOPY) { release_sock(sk); return sock_setsockopt(sock, level, optname, optval, optlen); } /* Use 'int' type here, because variable to * set this option usually has this type. 
*/ COPY_IN(zerocopy); if (zerocopy < 0 || zerocopy > 1) { err = -EINVAL; goto exit; } if (transport && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto exit; } sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy); goto exit; } switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); vsk->buffer_max_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); vsk->buffer_min_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: { struct __kernel_sock_timeval tv; err = sock_copy_user_timeval(&tv, optval, optlen, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); if (err) break; if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { vsk->connect_timeout = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ)); if (vsk->connect_timeout == 0) vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; } else { err = -ERANGE; } break; } default: err = -ENOPROTOOPT; break; } #undef COPY_IN exit: release_sock(sk); return err; } static int vsock_connectible_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); union { u64 val64; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; } v; int lv = sizeof(v.val64); int len; if (level != AF_VSOCK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; memset(&v, 0, sizeof(v)); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: v.val64 = vsk->buffer_size; break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: v.val64 = vsk->buffer_max_size; break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: v.val64 = vsk->buffer_min_size; break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: lv = sock_get_timeout(vsk->connect_timeout, &v, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); break; default: return -ENOPROTOOPT; } if (len < lv) return -EINVAL; if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; struct vsock_transport_send_notify_data send_data; DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); total_written = 0; err = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); transport = vsk->transport; /* Callers should not provide a destination with connection oriented * sockets. */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } /* Send data only if both sides are not shutdown in the direction. */ if (sk->sk_shutdown & SEND_SHUTDOWN || vsk->peer_shutdown & RCV_SHUTDOWN) { err = -EPIPE; goto out; } if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; } if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EDESTADDRREQ; goto out; } if (msg->msg_flags & MSG_ZEROCOPY && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto out; } /* Wait for room in the produce queue to enqueue our user's data. 
*/ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = transport->notify_send_init(vsk, &send_data); if (err < 0) goto out; while (total_written < len) { ssize_t written; add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && !(vsk->peer_shutdown & RCV_SHUTDOWN)) { /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } } remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after * sleeping. */ if (sk->sk_err) { err = -sk->sk_err; goto out_err; } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || (vsk->peer_shutdown & RCV_SHUTDOWN)) { err = -EPIPE; goto out_err; } err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) goto out_err; /* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is * smaller than the queue size. It is the caller's * responsibility to check how many bytes we were able to send. */ if (sk->sk_type == SOCK_SEQPACKET) { written = transport->seqpacket_enqueue(vsk, msg, len - total_written); } else { written = transport->stream_enqueue(vsk, msg, len - total_written); } if (written < 0) { err = written; goto out_err; } total_written += written; err = transport->notify_send_post_enqueue( vsk, written, &send_data); if (err < 0) goto out_err; } out_err: if (total_written > 0) { /* Return number of written bytes only if: * 1) SOCK_STREAM socket. * 2) SOCK_SEQPACKET socket when whole buffer is sent. */ if (sk->sk_type == SOCK_STREAM || total_written == len) err = total_written; } out: if (sk->sk_type == SOCK_STREAM) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); return err; } static int vsock_connectible_wait_data(struct sock *sk, struct wait_queue_entry *wait, long timeout, struct vsock_transport_recv_notify_data *recv_data, size_t target) { const struct vsock_transport *transport; struct vsock_sock *vsk; s64 data; int err; vsk = vsock_sk(sk); err = 0; transport = vsk->transport; while (1) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); data = vsock_connectible_has_data(vsk); if (data != 0) break; if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) || (vsk->peer_shutdown & SEND_SHUTDOWN)) { break; } /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; break; } if (recv_data) { err = transport->notify_recv_pre_block(vsk, target, recv_data); if (err < 0) break; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); break; } else if (timeout == 0) { err = -EAGAIN; break; } } finish_wait(sk_sleep(sk), wait); if (err) return err; /* Internal transport error when checking for available * data. XXX This should be changed to a connection * reset in a later change. 
*/ if (data < 0) return -ENOMEM; return data; } static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct vsock_transport_recv_notify_data recv_data; const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t copied; size_t target; long timeout; int err; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to * have that much data to consume before dequeueing. Note that this * makes it impossible to handle cases where target is greater than the * queue size. */ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (target >= transport->stream_rcvhiwat(vsk)) { err = -ENOMEM; goto out; } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); copied = 0; err = transport->notify_recv_init(vsk, target, &recv_data); if (err < 0) goto out; while (1) { ssize_t read; err = vsock_connectible_wait_data(sk, &wait, timeout, &recv_data, target); if (err <= 0) break; err = transport->notify_recv_pre_dequeue(vsk, target, &recv_data); if (err < 0) break; read = transport->stream_dequeue(vsk, msg, len - copied, flags); if (read < 0) { err = read; break; } copied += read; err = transport->notify_recv_post_dequeue(vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) goto out; if (read >= target || flags & MSG_PEEK) break; target -= read; } if (sk->sk_err) err = -sk->sk_err; else if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; if (copied > 0) err = copied; out: return err; } static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t msg_len; long timeout; int err = 0; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); if (err <= 0) goto out; msg_len = transport->seqpacket_dequeue(vsk, msg, flags); if (msg_len < 0) { err = msg_len; goto out; } if (sk->sk_err) { err = -sk->sk_err; } else if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; } else { /* User sets MSG_TRUNC, so return real length of * packet. */ if (flags & MSG_TRUNC) err = msg_len; else err = len - msg_data_left(msg); /* Always set MSG_TRUNC if real length of packet is * bigger than user's buffer. */ if (msg_len > len) msg->msg_flags |= MSG_TRUNC; } out: return err; } int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; int err; sk = sock->sk; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR); vsk = vsock_sk(sk); err = 0; lock_sock(sk); transport = vsk->transport; if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occurred with the * SOCK_DONE flag. */ if (sock_flag(sk, SOCK_DONE)) err = 0; else err = -ENOTCONN; goto out; } if (flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; } /* We don't check peer_shutdown flag here since peer may actually shut * down, but there can be data in the queue that a local socket can * receive. */ if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; goto out; } /* It is valid on Linux to pass in a zero-length receive buffer. 
This * is not an error. We may as well bail out now. */ if (!len) { err = 0; goto out; } if (sk->sk_type == SOCK_STREAM) err = __vsock_stream_recvmsg(sk, msg, len, flags); else err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); return err; } int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot; prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif return __vsock_connectible_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); static int vsock_set_rcvlowat(struct sock *sk, int val) { const struct vsock_transport *transport; struct vsock_sock *vsk; vsk = vsock_sk(sk); if (val > vsk->buffer_size) return -EINVAL; transport = vsk->transport; if (transport && transport->notify_set_rcvlowat) { int err; err = transport->notify_set_rcvlowat(vsk, val); if (err) return err; } WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); return 0; } static const struct proto_ops vsock_stream_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = vsock_set_rcvlowat, .read_skb = vsock_read_skb, }; static const struct proto_ops vsock_seqpacket_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct vsock_sock *vsk; struct sock *sk; int ret; if (!sock) return -EINVAL; if (protocol && protocol != PF_VSOCK) return -EPROTONOSUPPORT; switch (sock->type) { case SOCK_DGRAM: sock->ops = &vsock_dgram_ops; break; case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; case SOCK_SEQPACKET: sock->ops = &vsock_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sock->state = SS_UNCONNECTED; sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); if (!sk) return -ENOMEM; vsk = vsock_sk(sk); if (sock->type == SOCK_DGRAM) { ret = vsock_assign_transport(vsk, NULL); if (ret < 0) { sock->sk = NULL; sock_put(sk); return ret; } } /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its * proto_ops, so there is no handler for custom logic. 
*/ if (sock_type_connectible(sock->type)) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); vsock_insert_unbound(vsk); return 0; } static const struct net_proto_family vsock_family_ops = { .family = AF_VSOCK, .create = vsock_create, .owner = THIS_MODULE, }; static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; int retval = 0; u32 cid; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: /* To be compatible with the VMCI behavior, we prioritize the * guest CID instead of well-know host CID (VMADDR_CID_HOST). */ cid = vsock_registered_transport_cid(&transport_g2h); if (cid == VMADDR_CID_ANY) cid = vsock_registered_transport_cid(&transport_h2g); if (cid == VMADDR_CID_ANY) cid = vsock_registered_transport_cid(&transport_local); if (put_user(cid, p) != 0) retval = -EFAULT; break; default: retval = -ENOIOCTLCMD; } return retval; } static long vsock_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT static long vsock_dev_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); } #endif static const struct file_operations vsock_device_ops = { .owner = THIS_MODULE, .unlocked_ioctl = vsock_dev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vsock_dev_compat_ioctl, #endif .open = nonseekable_open, }; static struct miscdevice vsock_device = { .name = "vsock", .fops = &vsock_device_ops, }; static int __init vsock_init(void) { int err = 0; vsock_init_tables(); vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { pr_err("Failed to register misc device\n"); goto err_reset_transport; } err = proto_register(&vsock_proto, 1); /* we want our slab */ if (err) { pr_err("Cannot register vsock protocol\n"); goto err_deregister_misc; } err = sock_register(&vsock_family_ops); if (err) { pr_err("could not register af_vsock (%d) address family: %d\n", AF_VSOCK, err); goto err_unregister_proto; } vsock_bpf_build_proto(); return 0; err_unregister_proto: proto_unregister(&vsock_proto); err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: return err; } static void __exit vsock_exit(void) { misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); } const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); int vsock_core_register(const struct vsock_transport *t, int features) { const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; int err = mutex_lock_interruptible(&vsock_register_mutex); if (err) return err; t_h2g = transport_h2g; t_g2h = transport_g2h; t_dgram = transport_dgram; t_local = transport_local; if (features & VSOCK_TRANSPORT_F_H2G) { if (t_h2g) { err = -EBUSY; goto err_busy; } t_h2g = t; } if (features & VSOCK_TRANSPORT_F_G2H) { if (t_g2h) { err = -EBUSY; goto err_busy; } t_g2h = t; } if (features & VSOCK_TRANSPORT_F_DGRAM) { if (t_dgram) { err = -EBUSY; goto err_busy; } t_dgram = t; } if (features & VSOCK_TRANSPORT_F_LOCAL) { if (t_local) { err = -EBUSY; goto err_busy; } t_local = t; } transport_h2g = t_h2g; transport_g2h = t_g2h; transport_dgram = t_dgram; transport_local = t_local; err_busy: mutex_unlock(&vsock_register_mutex); return err; } EXPORT_SYMBOL_GPL(vsock_core_register); void vsock_core_unregister(const struct vsock_transport *t) { 
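	/*
	 * Clear every transport role (h2g, g2h, dgram, local) that still
	 * points at @t so no new socket can be assigned to it. Sockets that
	 * already hold a vsk->transport reference keep it (along with the
	 * module reference taken in vsock_assign_transport()) until they are
	 * released. Counterpart of vsock_core_register().
	 */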
	mutex_lock(&vsock_register_mutex);
	if (transport_h2g == t)
		transport_h2g = NULL;
	if (transport_g2h == t)
		transport_g2h = NULL;
	if (transport_dgram == t)
		transport_dgram = NULL;
	if (transport_local == t)
		transport_local = NULL;
	mutex_unlock(&vsock_register_mutex);
}
EXPORT_SYMBOL_GPL(vsock_core_unregister);

module_init(vsock_init);
module_exit(vsock_exit);

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");
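/*
 * Illustrative only, not part of the kernel build: a minimal userspace
 * sketch of the connect path handled by vsock_connect() above. The CID
 * (VMADDR_CID_HOST) and the port number (1234) are arbitrary example
 * values chosen for this sketch.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <linux/vm_sockets.h>
 *
 *	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *	struct sockaddr_vm addr;
 *
 *	memset(&addr, 0, sizeof(addr));
 *	addr.svm_family = AF_VSOCK;
 *	addr.svm_cid = VMADDR_CID_HOST;	// guest -> host connection
 *	addr.svm_port = 1234;
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 */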
1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 
2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 
3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 
3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 
4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 
5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 
6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req.
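As a rough, self-contained sketch of the policy just described (not the kernel function itself), the choice reduces to comparing the effective file size, the larger of the current size and the size after allocation, against the stream threshold; the constant and helper names below are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative default; the real value is sbi->s_mb_stream_request. */
#define STREAM_REQUEST_BLOCKS 16

/* Use group (locality) preallocation only for small files. */
static bool use_group_prealloc(unsigned long cur_size_blocks,
			       unsigned long size_after_alloc_blocks)
{
	unsigned long size = cur_size_blocks > size_after_alloc_blocks ?
			     cur_size_blocks : size_after_alloc_blocks;

	return size < STREAM_REQUEST_BLOCKS;
}

int main(void)
{
	printf("4-block file:  group prealloc? %d\n", use_group_prealloc(2, 4));
	printf("64-block file: group prealloc? %d\n", use_group_prealloc(64, 64));
	return 0;
}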
The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / * blocksize) blocks. So it can have information regarding groups_per_folio * which is blocks_per_folio/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. 
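To make the buddy-cache folio layout described above concrete, here is a small worked example; the 1 KiB block size and 4 KiB folio size are illustrative values only, and the even/odd slot mapping simply follows the "[ group N bitmap ][ group N buddy ]" pairing from the comment.

#include <stdio.h>

int main(void)
{
	unsigned int folio_size = 4096, blocksize = 1024;
	unsigned int blocks_per_folio = folio_size / blocksize;	/* 4 */
	unsigned int groups_per_folio = blocks_per_folio / 2;	/* 2 */

	printf("blocks per folio: %u, groups per folio: %u\n",
	       blocks_per_folio, groups_per_folio);

	for (unsigned int group = 0; group < 4; group++) {
		unsigned int bitmap_blk = group * 2;		/* even slot */
		unsigned int buddy_blk = group * 2 + 1;		/* odd slot  */

		printf("group %u: bitmap -> folio %u slot %u, buddy -> folio %u slot %u\n",
		       group,
		       bitmap_blk / blocks_per_folio, bitmap_blk % blocks_per_folio,
		       buddy_blk / blocks_per_folio, buddy_blk % blocks_per_folio);
	}
	return 0;
}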
The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of xarrays. Group-infos are * placed in appropriate xarrays. * * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special xarray for completely empty * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed * in appropriate xarrays. * * In xarray, the index is the block group number, the value is the block group * information, and a non-empty value indicates the block group is present in * the current xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. 
* * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. 
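The "in-core buddy = on-disk bitmap + preallocation descriptors" identity above can be exercised with a toy count-only model; the operations applied below are the ones listed in the next paragraph (new PA, use inode PA, discard PA), and none of the structures here are the kernel's.

#include <assert.h>
#include <stdio.h>

struct counts {
	unsigned int ondisk_used;	/* bits set in the on-disk bitmap */
	unsigned int pa;		/* blocks still held by PAs */
	unsigned int buddy_used;	/* bits set in the in-core buddy */
};

/* in-core buddy = on-disk bitmap + preallocation descriptors */
static void check(const struct counts *c)
{
	assert(c->buddy_used == c->ondisk_used + c->pa);
}

int main(void)
{
	struct counts c = { .ondisk_used = 100, .pa = 0, .buddy_used = 100 };

	c.buddy_used += 8; c.pa += 8; check(&c);	/* new PA: buddy += N; PA = N */
	c.ondisk_used += 3; c.pa -= 3; check(&c);	/* use PA: on-disk += N; PA -= N */
	c.buddy_used -= c.pa; c.pa = 0; check(&c);	/* discard PA: buddy -= PA; PA = 0 */

	printf("on-disk %u, PA %u, buddy %u\n", c.ondisk_used, c.pa, c.buddy_used);
	return 0;
}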
* * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. 
no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks); /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. 
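A minimal userspace sketch of the sample-then-compare retry pattern described above, with a single global counter standing in for the per-CPU discard_pa_seq and a stub allocator; the names and the always-failing stub are assumptions made for illustration.

#include <stdbool.h>
#include <stdio.h>

static unsigned long long discard_seq;	/* stand-in for the per-CPU counter sum */

static bool try_allocate(void)
{
	return false;			/* pretend the fast path found nothing */
}

static bool allocate_with_retry(void)
{
	unsigned long long seq = discard_seq;	/* sample before allocating */

	while (!try_allocate()) {
		/*
		 * If nobody allocated, freed or discarded anything since our
		 * sample, retrying cannot find new blocks: give up.
		 */
		if (discard_seq == seq)
			return false;
		seq = discard_seq;		/* progress elsewhere: retry */
	}
	return true;
}

int main(void)
{
	printf("allocation %s\n", allocate_with_retry() ? "succeeded" : "failed");
	return 0;
}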
*/ static DEFINE_PER_CPU(u64, discard_pa_seq); static inline u64 ext4_get_discard_pa_seq_sum(void) { int __cpu; u64 __seq = 0; for_each_possible_cpu(__cpu) __seq += per_cpu(discard_pa_seq, __cpu); return __seq; } static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!grp->bb_bitmap) return; bh = ext4_read_block_bitmap(sb, group); if (IS_ERR_OR_NULL(bh)) { kfree(grp->bb_bitmap); grp->bb_bitmap = NULL; return; } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { kfree(grp->bb_bitmap); } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { return; } static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) /* * Perform buddy integrity check with the following steps: * * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap): * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block), * at most one of the two corresponding lower-order bits may be clear (free). * * 2. Order-0 (bitmap) validation, performed on bit pairs: * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits * must not be free (0). * - If both bits in a pair are clear (0, free), then exactly one of the corresponding * higher-order bits must be free (0). * * 3. Preallocation (pa) list validation: * For each preallocated block (pa) in the group: * - Verify that pa_pstart falls within the bounds of this block group. * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1). 
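The checks described above rely on a fixed index mapping between buddy orders; the tiny sketch below only prints that mapping (an order-n bit i covers order-(n-1) bits 2i and 2i+1, and an order-0 bit b corresponds to bit b >> j at order j), with the example indices chosen arbitrarily.

#include <stdio.h>

int main(void)
{
	int i = 5, order = 3;
	int b = 42;			/* an order-0 (block bitmap) bit */

	printf("order %d bit %d covers order %d bits %d and %d\n",
	       order, i, order - 1, i << 1, (i << 1) + 1);

	for (int j = 1; j <= 4; j++)
		printf("order-0 bit %d -> order %d bit %d\n", b, j, b >> j);
	return 0;
}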
*/ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } continue; } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } } else { fstart = -1; } if (!(i & 1)) { int in_use, zero_bit_count = 0; in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy); for (j = 1; j < e4b->bd_blkbits + 2; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); if (!mb_test_bit(k, buddy2)) zero_bit_count++; } MB_CHECK_ASSERT(zero_bit_count == !in_use); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); if (!pa->pa_len) continue; ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { int order; /* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups. 
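The ffs()/fls() arithmetic in ext4_mb_mark_free_simple() above can be tried out in isolation; the sketch below uses GCC/Clang builtins in place of the kernel helpers and an arbitrary border value, so treat it as an approximation of the chunking step, not the kernel routine.

#include <stdio.h>

static unsigned int ffsu(unsigned int x)	/* first set bit, 1-based */
{
	return (unsigned int)__builtin_ffs((int)x);
}

static unsigned int flsu(unsigned int x)	/* last set bit, 1-based */
{
	return x ? 32 - (unsigned int)__builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int first = 5, len = 13;
	unsigned int border = 2048;	/* stand-in for 2 << s_blocksize_bits */

	while (len > 0) {
		/* largest aligned run starting at 'first' */
		unsigned int max = ffsu(first | border) - 1;
		/* largest power of two not exceeding 'len' */
		unsigned int min = flsu(len) - 1;
		unsigned int chunk;

		if (max < min)
			min = max;
		chunk = 1U << min;
		printf("chunk of %u block(s) at %u (order %u)\n", chunk, first, min);
		len -= chunk;
		first += chunk;
	}
	return 0;
}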
*/ order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb)) order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) order = MB_NUM_ORDERS(sb) - 1; return order; } /* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old; if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; old = grp->bb_avg_fragment_size_order; new = grp->bb_fragments == 0 ? -1 : mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new == old) return; if (old >= 0) xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); grp->bb_avg_fragment_size_order = new; if (new >= 0) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", grp->bb_group, new, err); } } static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct xarray *xa, ext4_group_t start, ext4_group_t end) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); enum criteria cr = ac->ac_criteria; ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned long group = start; struct ext4_group_info *grp; if (WARN_ON_ONCE(end > ngroups || start >= end)) return 0; xa_for_each_range(xa, group, grp, start, end - 1) { int err; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); err = ext4_mb_scan_group(ac, grp->bb_group); if (err || ac->ac_status != AC_STATUS_CONTINUE) return err; cond_resched(); } return 0; } /* * Find a suitable group of given order from the largest free orders xarray. */ static inline int ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i; int ret = 0; ext4_group_t start, end; start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* Increment cr and search again if no group is found */ ac->ac_criteria = CR_GOAL_LEN_FAST; return ret; } /* * Find a suitable group of given order from the average fragments xarray. */ static int ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. 
*/ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, ret = 0; ext4_group_t start, end; start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) ac->ac_criteria = CR_BEST_AVAIL_LEN; else ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } /* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, ext4_group_t group) { int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, order, min_order; unsigned long num_stripe_clusters = 0; ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early. */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters */ min_order = fls(num_stripe_clusters) - 1; } if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations. */ ac->ac_g_ex.fe_len = 1 << i; if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency. 
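 *
 * For example, with a 48-cluster stripe a goal trimmed to 32 clusters
 * becomes 48, and one trimmed to 64 becomes 96, so the request stays
 * a multiple of the stripe size.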
*/ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); } frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1; } /* * next linear group for allocation. */ static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ *group = *group + 1 >= ngroups ? 0 : *group + 1; } static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { int ret, i; enum criteria cr = ac->ac_criteria; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group = *start; for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { ret = ext4_mb_scan_group(ac, group); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; cond_resched(); } *start = group; if (count == ngroups) ac->ac_criteria++; /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); return 0; } static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) { int ret = 0; ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; ac->ac_prefetch_grp = start; ac->ac_prefetch_nr = 0; if (!should_optimize_scan(ac)) return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ if (sbi->s_mb_max_linear_groups) ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, sbi->s_mb_max_linear_groups); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; switch (ac->ac_criteria) { case CR_POWER2_ALIGNED: return ext4_mb_scan_groups_p2_aligned(ac, start); case CR_GOAL_LEN_FAST: return ext4_mb_scan_groups_goal_fast(ac, start); case CR_BEST_AVAIL_LEN: return ext4_mb_scan_groups_best_avail(ac, start); default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here. */ WARN_ON(1); } return 0; } /* * Cache the order of the largest free extent we have available in this block * group. 
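 *
 * When mb_optimize_scan is enabled, the group is also moved to the
 * matching s_mb_largest_free_orders xarray so that CR_POWER2_ALIGNED
 * scans can look groups up by order instead of walking every group.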
*/ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old = grp->bb_largest_free_order; for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) if (grp->bb_counters[new] > 0) break; /* No need to move between order lists? */ if (new == old) return; if (old >= 0) { struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) xa_erase(xa, grp->bb_group); } grp->bb_largest_free_order = new; if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_largest_free_orders[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", grp->bb_group, new, err); } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) mb_set_bits(buddy, 0, count); e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A folio can * contain blocks_per_folio (folio_size / blocksize) blocks. 
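 * For example, a 4KiB folio with 1KiB blocks covers 4 such blocks,
 * i.e. the bitmap/buddy pairs of two groups.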
* So it can have information regarding groups_per_folio which * is blocks_per_folio/2 * * Locking note: This routine takes the block group lock of all groups * for this folio; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; int blocks_per_folio; int groups_per_folio; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_folio = folio_size(folio) / blocksize; WARN_ON_ONCE(!blocks_per_folio); groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); mb_debug(sb, "init folio %lu\n", folio->index); /* allocate buffer_heads to read bitmaps */ if (groups_per_folio > 1) { i = sizeof(struct buffer_head *) * groups_per_folio; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; /* read all groups the folio covers into the cache */ first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2; for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If folio is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ if (folio_test_uptodate(folio) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = EXT4_PG_TO_LBLK(inode, folio->index); for (i = 0; i < blocks_per_folio; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ grinfo = ext4_get_group_info(sb, group); if (!grinfo) { err = -EFSCORRUPTED; goto out; } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); 
/* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } folio_mark_uptodate(folio); out: if (bh) { for (i = 0; i < groups_per_folio; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap folios. This makes sure other parallel init_group * on the same buddy folio doesn't happen while holding the buddy folio lock. * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = EXT4_LBLK_TO_PG(inode, block); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; pnum = EXT4_LBLK_TO_PG(inode, block); if (folio_contains(folio, pnum)) { /* buddy and bitmap are on the same folio */ return 0; } /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_folio) { folio_unlock(e4b->bd_buddy_folio); folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0; might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED; /* * This ensures that we don't reinit the buddy cache * folio which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy folio to page cache. * The call to ext4_mb_get_buddy_folio_lock will mark the * folio accessed. 
*/ ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } folio = e4b.bd_bitmap_folio; ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same folio we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ folio = e4b.bd_buddy_folio; ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_folio_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int block; int pnum; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(sb, "load group %u\n", group); grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = EXT4_LBLK_TO_PG(inode, block); /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) /* * drop the folio reference and try * to get the folio with lock. If we * are not uptodate that implies * somebody just created the folio but * is yet to initialize it. So * wait for it to initialize. */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block))); } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; pnum = EXT4_LBLK_TO_PG(inode, block); /* buddy and bitmap are on the same folio? 
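 * The bitmap lives at block index 2 * group and the buddy right after
 * it, so when a folio spans several blocks both usually end up in the
 * same folio and the second lookup can be skipped.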
*/ if (folio_contains(folio, pnum)) { folio_get(folio); goto update_buddy; } /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); goto err; } } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } update_buddy: /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; e4b->bd_buddy = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); return 0; err: if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_folio) folio_put(e4b->bd_buddy_folio); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); while (order <= e4b->bd_blkbits + 1) { bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr = 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding 
blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; /* * Fastcommit replay can free already freed blocks which * corrupts allocation info. Regenerate it. */ if (sbi->s_mount_state & EXT4_FC_REPLAY) { mb_regenerate_buddy(e4b); goto check; } blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); return; } this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? 
-1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); ex->fe_start = block; ex->fe_group = e4b->bd_group; block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
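 * start is aligned to this order and the request covers the whole
 * 1 << ord chunk, so a single bit flip in the order-ord buddy marks
 * it used.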
*/ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord_start = (start >> ord) << ord; ord_end = ord_start + (1 << ord); /* first chunk */ if (start > ord_start) ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, ord_start, start - ord_start, e4b->bd_info); /* last chunk */ if (start + len < ord_end) { ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, start + len, ord_end - (start + len), e4b->bd_info); break; } len = start + len - ord_end; start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the folio reference. We want the folio to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_folio = e4b->bd_bitmap_folio; folio_get(ac->ac_bitmap_folio); ac->ac_buddy_folio = e4b->bd_buddy_folio; folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } /* * As we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor. */ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) ext4_mb_use_best_found(ac, e4b); } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. 
* * The algorithm used is roughly as follows: * * * If free extent found is exactly as big as goal, then * stop the scan and use it immediately * * * If free extent found is smaller than goal, then keep retrying * upto a max of sbi->s_mb_max_to_scan times (default 200). After * that stop scanning and use whatever we have. * * * If free extent found is bigger than goal, then keep retrying * upto a max of sbi->s_mb_min_to_scan times (default 10) before * stopping the scan and using the extent. * * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!grp) return -EFSCORRUPTED; if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; 
ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); if (WARN_RATELIMIT(buddy == NULL, "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); break; } ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i, j, freelen; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * have free blocks */ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. 
But bitmap says 0", free); break; } if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough * continuous free extent, so skip over the smaller free * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); freelen = j - i; if (freelen < ac->ac_g_ex.fe_len) { i = j; free -= freelen; continue; } } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += stripe; } } static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) { bool is_stripe_aligned; struct ext4_sb_info *sbi; enum criteria cr = ac->ac_criteria; ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) return ext4_mb_simple_scan_group(ac, ac->ac_e4b); sbi = EXT4_SB(ac->ac_sb); is_stripe_aligned = false; if ((sbi->s_stripe >= sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) is_stripe_aligned = true; if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && is_stripe_aligned) ext4_mb_scan_aligned(ac, ac->ac_e4b); if (ac->ac_status == AC_STATUS_CONTINUE) ext4_mb_complex_scan_group(ac, ac->ac_e4b); } /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. 
*/ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; if (free == 0) return false; fragments = grp->bb_fragments; if (fragments == 0) return false; switch (cr) { case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return false; if (free < ac->ac_g_ex.fe_len) return false; if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) return false; return true; case CR_GOAL_LEN_FAST: case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; case CR_ANY_FREE: return true; default: BUG(); } return false; } /* * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. * * Note: because we are conditionally operating with the group lock in * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this * function using __acquire and __release. This means we need to be * super careful before messing with the error path handling via "goto * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; if (!grp) return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } free = grp->bb_free; if (free == 0) goto out; /* * In all criterias except CR_ANY_FREE we try to avoid groups that * can't possibly satisfy the full goal request due to insufficient * free blocks. */ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; /* * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this * gets used for metadata block allocation, and we want to make * sure we locate metadata blocks in the first block group in * the flex_bg if possible. 
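 *
 * (group & ((1 << s_log_groups_per_flex) - 1)) == 0 picks out that
 * first group: with 16 groups per flex_bg, groups 0, 16, 32, ... are
 * never skipped by this shortcut and get their buddy data initialized.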
*/ if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) return 0; ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } ret = ext4_mb_good_group(ac, group, cr); out: if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } return ret; } /* * Start prefetching @nr block bitmaps starting at @group. * Return the next group which needs to be prefetched. */ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct buffer_head *bh; struct blk_plug plug; blk_start_plug(&plug); while (nr-- > 0) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); struct ext4_group_info *grp = ext4_get_group_info(sb, group); /* * Prefetch block groups with free blocks; but don't * bother if it is marked uninitialized on disk, since * it won't require I/O to read. Also only try to * prefetch once, so we avoid getblk() call, which can * be expensive. */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); } } if (++group >= ngroups) group = 0; } blk_finish_plug(&plug); return group; } /* * Batch reads of the block allocation bitmaps to get * multiple READs in flight; limit prefetching at inexpensive * CR, otherwise mballoc can spend a lot of time loading * imperfect groups */ static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi; if (ac->ac_prefetch_grp != group) return; sbi = EXT4_SB(ac->ac_sb); if (ext4_mb_cr_expensive(ac->ac_criteria) || ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { unsigned int nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(ac->ac_sb)) { nr = 1 << sbi->s_log_groups_per_flex; nr -= group & (nr - 1); nr = umin(nr, sbi->s_mb_prefetch); } ac->ac_prefetch_nr = nr; ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, &ac->ac_prefetch_ios); } } /* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed, or indeed if it was not initiated by * ext4_mb_prefetch did not start the I/O. * * TODO: We should actually kick off the buddy bitmap setup in a work * queue when the buffer I/O is completed, so that we don't block * waiting for the block allocation bitmap read to finish when * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
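 *
 * ext4_mb_prefetch_fini() walks backwards from @group over the @nr
 * most recently prefetched groups and initializes buddy data for any
 * that still need it.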
*/ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { struct ext4_group_desc *gdp; struct ext4_group_info *grp; while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } } } static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group) { int ret; struct super_block *sb = ac->ac_sb; enum criteria cr = ac->ac_criteria; ext4_mb_might_prefetch(ac, group); /* prevent unnecessary buddy loading. */ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) return 0; /* This now checks without needing the buddy folio */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!ac->ac_first_err) ac->ac_first_err = ret; return 0; } ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); if (ret) return ret; /* skip busy group */ if (cr >= CR_ANY_FREE) ext4_lock_group(sb, group); else if (!ext4_try_lock_group(sb, group)) goto out_unload; /* We need to check again after locking the block group. */ if (unlikely(!ext4_mb_good_group(ac, group, cr))) goto out_unlock; __ext4_mb_scan_group(ac); out_unlock: ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(ac->ac_e4b); return ret; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t i; int err = 0; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac_2order is set only if the fe_len is a power of 2 * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); ac->ac_g_ex.fe_start = -1; ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* * Let's just scan groups to find more-less suitable blocks We * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) ac->ac_criteria = CR_POWER2_ALIGNED; ac->ac_e4b = &e4b; ac->ac_prefetch_ios = 0; ac->ac_first_err = 0; repeat: while (ac->ac_criteria < EXT4_MB_NUM_CRS) { err = ext4_mb_scan_groups(ac); if (err) goto out; if (ac->ac_status != AC_STATUS_CONTINUE) break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. 
Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { int lost; /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) */ lost = atomic_inc_return(&sbi->s_mb_lost_chunks); mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, lost); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; ac->ac_criteria = CR_ANY_FREE; goto repeat; } } if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) atomic_inc(&sbi->s_bal_stream_goals); } out: if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) err = ac->ac_first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, ac->ac_criteria, err); if (ac->ac_prefetch_nr) ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i, err; char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); if (!grinfo) return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } ext4_mb_unload_buddy(&e4b); } /* * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ memcpy(sg, grinfo, i); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
sg->bb_counters[i] : 0); seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); seq_puts( seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); /* CR_POWER2_ALIGNED stats */ seq_puts(seq, "\tcr_p2_aligned_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); /* CR_ANY_FREE stats */ seq_puts(seq, "\tcr_any_free_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); /* Aggregates */ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", 
atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\tstream_goal_hits: %u\n", atomic_read(&sbi->s_bal_stream_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); seq_printf(seq, "\tbuddies_time_used: %llu\n", atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; unsigned long idx; position--; if (position >= MB_NUM_ORDERS(sb)) { position -= MB_NUM_ORDERS(sb); if (position == 0) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) count++; seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } if (position == 0) { seq_printf(seq, "optimize_scan: %d\n", test_opt2(sb, MB_OPTIMIZE_SCAN) ? 
1 : 0); seq_puts(seq, "max_free_order_lists:\n"); } count = 0; xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) count++; seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_structs_summary_ops = { .start = ext4_mb_seq_structs_summary_start, .next = ext4_mb_seq_structs_summary_next, .stop = ext4_mb_seq_structs_summary_stop, .show = ext4_mb_seq_structs_summary_show, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } rcu_read_lock(); old_groupinfo = rcu_dereference(sbi->s_group_info); if (old_groupinfo) memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); rcu_read_unlock(); rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. 
* If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; rcu_read_unlock(); } meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { struct ext4_group_info ***group_info; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); kfree(group_info[idx]); group_info[idx] = NULL; rcu_read_unlock(); } return -ENOMEM; } /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); if (err) return err; sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } /* To avoid potentially colliding with an valid on-disk inode number, * use EXT4_BAD_INO for the buddy cache inode number. This inode is * not in the inode hash, so it should never be found by iget(), but * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; ext4_set_inode_mapping_order(sbi->s_buddy_cache); for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) goto err_freebuddy; } if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is * unsigned integer, so the maximum shift is 32. 
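* Shifts of 32 or more are rejected below, since 1 << s_log_groups_per_flex would otherwise overflow the value used to size s_mb_prefetch.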
*/ if (sbi->s_es->s_log_groups_per_flex >= 32) { ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); goto err_freebuddy; } sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32; } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); /* * now many real IOs to prefetch within a single allocation at * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related * optimization we shouldn't try to load too many groups, at some point * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); return 0; err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); if (grp) kmem_cache_free(cachep, grp); } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) kfree(group_info[i]); rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: rcu_read_lock(); kvfree(rcu_dereference(sbi->s_group_info)); rcu_read_unlock(); return -ENOMEM; } static void ext4_groupinfo_destroy_slabs(void) { int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } static int ext4_groupinfo_create_slab(size_t size) { static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); int slab_size; int blocksize_bits = order_base_2(size); int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep; if (cache_index >= NR_GRPINFO_CACHES) return -EINVAL; if (unlikely(cache_index < 0)) cache_index = 0; mutex_lock(&ext4_grpinfo_slab_create_mutex); if (ext4_groupinfo_caches[cache_index]) { mutex_unlock(&ext4_grpinfo_slab_create_mutex); return 0; /* Already created */ } slab_size = offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]); cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); ext4_groupinfo_caches[cache_index] = cachep; mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } return 0; } static void ext4_discard_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_discard_work); struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); load_grp = UINT_MAX; list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { /* * If filesystem is umounting or no memory or suffering * from no space, give up the discard */ if ((sb->s_flags & SB_ACTIVE) && !err && !atomic_read(&sbi->s_retry_alloc_pending)) { grp = fd->efd_group; if (grp != load_grp) { if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); err = ext4_mb_load_buddy(sb, grp, &e4b); if (err) { kmem_cache_free(ext4_free_data_cachep, fd); load_grp = UINT_MAX; continue; } else { load_grp = grp; } } ext4_lock_group(sb, grp); 
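/* Trim the freed range with the group locked so the buddy bitmap cannot change under the discard. */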
ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, fd->efd_start_cluster + fd->efd_count - 1, 1); ext4_unlock_group(sb, grp); } kmem_cache_free(ext4_free_data_cachep, fd); } if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); } static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) { if (!sbi->s_mb_avg_fragment_size) return; for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_avg_fragment_size[i]); kfree(sbi->s_mb_avg_fragment_size); sbi->s_mb_avg_fragment_size = NULL; } static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) { if (!sbi->s_mb_largest_free_orders) return; for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_largest_free_orders[i]); kfree(sbi->s_mb_largest_free_orders); sbi->s_mb_largest_free_orders = NULL; } int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; unsigned offset, offset_incr; unsigned max; int ret; i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { ret = -ENOMEM; goto out; } i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; goto out; } ret = ext4_groupinfo_create_slab(sb->s_blocksize); if (ret < 0) goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; i = 1; offset = 0; offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; offset += offset_incr; offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) xa_init(&sbi->s_mb_avg_fragment_size[i]); sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) xa_init(&sbi->s_mb_largest_free_orders[i]); spin_lock_init(&sbi->s_md_lock); atomic_set(&sbi->s_mb_free_pending, 0); INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file * systems, this is probably too big (i.e, if the cluster size * is 1 megabyte, then group preallocation size becomes half a * gigabyte!). As a default, we will keep a two megabyte * group pralloc size for cluster sizes up to 64k, and after * that, we will force a minimum group preallocation size of * 32 clusters. This translates to 8 megs when the cluster * size is 256k, and 32 megs when the cluster size is 1 meg, * which seems reasonable as a default. 
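* The max() below implements this: the default, expressed in blocks, is converted to clusters by the s_cluster_bits shift and then clamped to a floor of 32 clusters.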
*/ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than * the s_mb_group_prealloc as determined above. We want * the preallocation size to be an exact multiple of the * RAID stripe size so that preallocations don't fragment * the stripes. */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), DIV_ROUND_UP(sbi->s_groups_count, 4)); sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, sizeof(ext4_group_t), GFP_KERNEL); if (sbi->s_mb_last_groups == NULL) { ret = -ENOMEM; goto out; } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; goto out_free_last_groups; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } if (bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) goto out_free_locality_groups; return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out_free_last_groups: kfree(sbi->s_mb_last_groups); sbi->s_mb_last_groups = NULL; out: ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; return ret; } /* need to called with the ext4 group lock held */ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; int count = 0; list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; kmem_cache_free(ext4_pspace_cachep, pa); } return count; } void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; if (test_opt(sb, DISCARD)) { /* * wait the discard work to drain all of ext4_free_data */ flush_work(&sbi->s_discard_work); WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); } if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); if (!grinfo) continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); if (count) mb_debug(sb, "mballoc: %d PAs left\n", count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); rcu_read_unlock(); } ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", 
atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, "mballoc: %u generated and it took %llu", atomic_read(&sbi->s_mb_buddies_generated), atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); kfree(sbi->s_mb_last_groups); } static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + ext4_group_first_block_no(sb, block_group)); count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_free_data *entry) { struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. */ EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ folio_put(e4b.bd_buddy_folio); folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. 
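* Each entry on the per-commit freed-data list is returned to the buddy; with -o discard the entries are then queued for the discard worker instead of being freed right away.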
*/ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); if (test_opt(sb, DISCARD)) { spin_lock(&sbi->s_md_lock); wake = list_empty(&sbi->s_discard_list); list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); } } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) goto out_ac_free; return 0; out_ac_free: kmem_cache_destroy(ext4_ac_cachep); out_pa_free: kmem_cache_destroy(ext4_pspace_cachep); out: return -ENOMEM; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 static int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; int err; unsigned int i, already, changed = len; KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, handle, sb, state, group, blkoff, len, flags, ret_changed); if (ret_changed) *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) return PTR_ERR(bitmap_bh); if (handle) { BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) goto out_err; } err = -EIO; gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) goto out_err; if (handle) { BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; } ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); } if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { already = 0; for (i = 0; i < len; i++) if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == state) already++; changed = len - already; } if (state) { mb_set_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) - changed); } else { mb_clear_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) + changed); } ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); if 
(ret_changed) *ret_changed = changed; if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); struct flex_groups *fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); if (state) atomic64_sub(changed, &fg->free_clusters); else atomic64_add(changed, &fg->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); if (err) goto out_err; if (flags & EXT4_MB_SYNC_UPDATE) { sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); } out_err: brelse(bitmap_bh); return err; } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; int flags = 0; ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, 0, NULL); if (!err) err = -EFSCORRUPTED; return err; } #ifdef AGGRESSIVE_CHECK flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, flags, &changed); if (err && changed == 0) return err; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); return err; } /* * Idempotent helper for Ext4 fast commit replay path to set the state of * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; int err = 0; unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); /* * Check to see if we are freeing blocks across a group * boundary. * In case of flex_bg, this can happen that (block, len) may * span across more than one group. In that case we need to * get the corresponding group metadata to work with. * For this we have goto again loop. 
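* Each pass of the loop below clamps the range to the current group, marks that piece, and then advances block and len until the whole range has been processed.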
thisgrp_len = min_t(unsigned int, (unsigned int)len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); break; } err = ext4_mb_mark_context(NULL, sb, state, group, blkoff, clen, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; BUG_ON(len < 0); } } /* * here we normalize request for locality group * Group requests are normalized to s_mb_group_prealloc, which goes to * s_stripe if we set the same via mount option. * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* * This function returns the next element to look at during inode * PA rbtree walk. We assume that we have held the inode PA rbtree lock * (ei->i_prealloc_lock) * * new_start The start of the range we want to compare * cur_start The existing start that we are comparing against * node The node of the rb_tree */ static inline struct rb_node* ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) { if (new_start < cur_start) return node->rb_left; else return node->rb_right; } static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start; loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } read_unlock(&ei->i_prealloc_lock); } /* * Given an allocation context "ac" and a range "start", "end", check * and adjust boundaries if the range overlaps with any of the existing * preallocations stored in the corresponding inode of the allocation context. * * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; /* * Adjust the normalized range so that it doesn't overlap with any * existing preallocated blocks(PAs). Make sure to hold the rbtree lock * so it doesn't change underneath us.
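* The walk below finds the closest non-deleted PAs on the left and right and then trims [start, end) so that it touches neither of them.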
*/ read_lock(&ei->i_prealloc_lock); /* Step 1: find any one immediate neighboring PA of the normalized range */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || ac->ac_o_ex.fe_logical < tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } /* * Step 2: check if the found PA is left or right neighbor and * get the other neighbor */ if (tmp_pa) { if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { struct rb_node *tmp; left_pa = tmp_pa; tmp = rb_next(&left_pa->pa_node.inode_node); if (tmp) { right_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } else { struct rb_node *tmp; right_pa = tmp_pa; tmp = rb_prev(&right_pa->pa_node.inode_node); if (tmp) { left_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } } /* Step 3: get the non deleted neighbors */ if (left_pa) { for (iter = &left_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { left_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); left_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (right_pa) { for (iter = &right_pa->pa_node.inode_node;; iter = rb_next(iter)) { if (!iter) { right_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); right_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (left_pa) { left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } if (right_pa) { right_pa_start = right_pa->pa_lstart; BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); } /* Step 4: trim our normalized range to not overlap with the neighbors */ if (left_pa) { if (left_pa_end > new_start) new_start = left_pa_end; } if (right_pa) { if (right_pa_start < new_end) new_end = right_pa_start; } read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); *start = new_start; *end = new_end; } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; /* do normalize only data requests, metadata requests do not need preallocation */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { ext4_mb_normalize_group_request(ac); return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if 
(size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ start_off = 0; if (size <= 16 * 1024) { size = 16 * 1024; } else if (size <= 32 * 1024) { size = 32 * 1024; } else if (size <= 64 * 1024) { size = 64 * 1024; } else if (size <= 128 * 1024) { size = 128 * 1024; } else if (size <= 256 * 1024) { size = 256 * 1024; } else if (size <= 512 * 1024) { size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; size = 2 * 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; /* * For tiny groups (smaller than 8MB) the chosen allocation * alignment may be larger than group size. Make sure the * alignment does not move allocation to a different group which * makes mballoc fail assertions later. */ start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); /* avoid unnecessary preallocation that may trigger assertions */ if (start + size > EXT_MAX_BLOCKS) size = EXT_MAX_BLOCKS - start; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; start = ar->lleft + 1; } if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. */ if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); end = start + size; ext4_mb_pa_adjust_overlap(ac, &start, &end); size = end - start; /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. * This normalization is done such that original request of * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and * "size" boundaries. * (Note fe_len can be relaxed since FS block allocation API does not * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. 
* (See ext4_mb_use_inode_pa() for more details) */ if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && ar->pright >= size && ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } if (ar->pleft && (ar->lleft + 1 == start) && ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); for (int i=0; i<EXT4_MB_NUM_CRS; i++) { atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); } atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); /* did we allocate as much as normalizer originally wanted? */ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) trace_ext4_mballoc_alloc(ac); else trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. */ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (WARN_RATELIMIT(err, "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * folios in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. 
*/ return; ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) { spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", pa->pa_lstart, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. */ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; if (cpa == NULL) { atomic_inc(&pa->pa_count); return pa; } cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * check if found pa meets EXT4_MB_HINT_GOAL_ONLY */ static bool ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) return true; /* * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted * in ext4_mb_normalize_request and will keep same with ac_o_ex * from ext4_mb_initialize_context. Choose ac_g_ex here to keep * consistent with ext4_mb_find_by_goal. 
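* The two checks below verify that the PA maps the goal's logical start to exactly the requested physical block and that enough of the PA remains past that point to cover the goal length.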
*/ start = pa->pa_pstart + (ac->ac_g_ex.fe_logical - pa->pa_lstart); if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) return false; if (ac->ac_g_ex.fe_len > pa->pa_len - EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) return false; return true; } /* * search goal blocks in preallocated space */ static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; /* * first, try per-file preallocation by searching the inode pa rbtree. * * Here, we can't do a direct traversal of the tree because * ext4_mb_discard_group_preallocations() can concurrently mark the pa * deleted and that can cause direct traversal to skip some entries. */ read_lock(&ei->i_prealloc_lock); if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { goto try_group_pa; } /* * Step 1: Find a pa with logical start immediately adjacent to the * original logical start. This could be on the left or right. * * (tmp_pa->pa_lstart never changes so we can skip locking for it). */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); } /* * Step 2: The adjacent pa might be to the right of logical start, find * the left adjacent pa. After this step we'd have a valid tmp_pa whose * logical start is towards the left of original request's logical start */ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { struct rb_node *tmp; tmp = rb_prev(&tmp_pa->pa_node.inode_node); if (tmp) { tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } else { /* * If there is no adjacent pa to the left then finding * an overlapping pa is not possible hence stop searching * inode pa tree */ goto try_group_pa; } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); /* * Step 3: If the left adjacent pa is deleted, keep moving left to find * the first non deleted adjacent pa. After this step we should have a * valid tmp_pa which is guaranteed to be non deleted. */ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { /* * no non deleted left adjacent pa, so stop searching * inode pa tree */ goto try_group_pa; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { /* * We will keep holding the pa_lock from * this point on because we don't want group discard * to delete this pa underneath us. Since group * discard is anyway an ENOSPC operation it * should be okay for it to wait a few more cycles. */ break; } else { spin_unlock(&tmp_pa->pa_lock); } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); BUG_ON(tmp_pa->pa_deleted == 1); /* * Step 4: We now have the non deleted left adjacent pa. Only this * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't.
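* Inode PAs never overlap one another, so if this left neighbour does not reach the original logical start no other PA can satisfy the request.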
*/ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) { /* * Since PAs don't overlap, we won't find any other PA to * satisfy this. */ spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); read_unlock(&ei->i_prealloc_lock); return true; } else { /* * We found a valid overlapping pa but couldn't use it because * it had no free blocks. This should ideally never happen * because: * * 1. When a new inode pa is added to rbtree it must have * pa_free > 0 since otherwise we won't actually need * preallocation. * * 2. An inode pa that is in the rbtree can only have it's * pa_free become zero when another thread calls: * ext4_mb_new_blocks * ext4_mb_use_preallocated * ext4_mb_use_inode_pa * * 3. Further, after the above calls make pa_free == 0, we will * immediately remove it from the rbtree in: * ext4_mb_new_blocks * ext4_mb_release_context * ext4_mb_put_pa * * 4. Since the pa_free becoming 0 and pa_free getting removed * from tree both happen in ext4_mb_new_blocks, which is always * called with i_data_sem held for data allocations, we can be * sure that another process will never see a pa in rbtree with * pa_free == 0. */ WARN_ON_ONCE(tmp_pa->pa_free == 0); } spin_unlock(&tmp_pa->pa_lock); try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], pa_node.lg_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, tmp_pa, cpa); } spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } if (cpa) { ext4_mb_use_group_pa(ac, cpa); return true; } return false; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; if (!grp) return; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. 
* we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); mb_set_bits(bitmap, start, len); preallocated += len; } mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_mark_pa_deleted(struct super_block *sb, struct ext4_prealloc_space *pa) { struct ext4_inode_info *ei; if (pa->pa_deleted) { ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", pa->pa_type, pa->pa_pstart, pa->pa_lstart, pa->pa_len); return; } pa->pa_deleted = 1; if (pa->pa_type == MB_INODE_PA) { ei = EXT4_I(pa->pa_inode); atomic_dec(&ei->i_prealloc_active); } } static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); ext4_mb_pa_free(pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; } ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. 
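* i.e. the buddy would show block B as free even though P2 has just allocated it;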
to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { write_lock(pa->pa_node_lock.inode_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) { struct rb_node **iter = &root->rb_node, *parent = NULL; struct ext4_prealloc_space *iter_pa, *new_pa; ext4_lblk_t iter_start, new_start; while (*iter) { iter_pa = rb_entry(*iter, struct ext4_prealloc_space, pa_node.inode_node); new_pa = rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node); iter_start = iter_pa->pa_lstart; new_start = new_pa->pa_lstart; parent = *iter; if (new_start < iter_start) iter = &((*iter)->rb_left); else iter = &((*iter)->rb_right); } rb_link_node(new, parent, iter); rb_insert_color(new, root); } /* * creates new preallocated space for given inode */ static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { struct ext4_free_extent ex = { .fe_logical = ac->ac_g_ex.fe_logical, .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); /* * We can't allocate as much as normalizer wants, so we try * to get proper lstart to cover the original request, except * when the goal doesn't cover the original request as below: * * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 * best_ex:0/200(200) -> adjusted: 1848/2048(200) */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* * Use the below logic for adjusting best extent as it keeps * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original end * 3. Else, keep the best ex at start of original request. 
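* The three assignments to ex.fe_logical below try these cases in order; the first two jump to adjust_bex when they cover the original request, and the third is used as a fallback.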
*/ ex.fe_len = ac->ac_b_ex.fe_len; ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); write_lock(pa->pa_node_lock.inode_lock); ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } /* * creates new preallocated space for locality group inodes belongs to */ static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; lg = ac->ac_lg; BUG_ON(lg == NULL); pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ } static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) ext4_mb_new_group_pa(ac); else ext4_mb_new_inode_pa(ac); } /* * finds all unused blocks in on-disk bitmap, frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. 
* TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. */ } atomic_add(free, &sbi->s_mb_discarded); } static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; int free = 0; if (!grp) return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); goto out_dbg; } ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); *busy = 1; continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* seems this 
one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); if (!free) this_cpu_inc(discard_pa_seq); /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); list_del(&pa->pa_group_list); list_add(&pa->u.pa_tmp_list, &list); } /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ if (pa->pa_type == MB_GROUP_PA) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { write_lock(pa->pa_node_lock.inode_lock); ei = EXT4_I(pa->pa_inode); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); } list_del(&pa->u.pa_tmp_list); if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_mb_pa_free(pa); } } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) return; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. 
as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); ext4_mb_pa_free(pa); } } static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa; BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); if (!pa) return -ENOMEM; atomic_set(&pa->pa_count, 1); ac->ac_pa = pa; return 0; } static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); /* * current function is only called due to an error or due to * len of found blocks < len of requested blocks hence the PA has not * been added to grp->bb_prealloc_list. 
So we don't need to lock it */ pa->pa_deleted = 1; ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; if (!grp) continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); mb_debug(sb, "PA:%u:%d:%d\n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, grp->bb_fragments); } } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" " Allocation context details:"); mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, (unsigned long)ac->ac_g_ex.fe_logical, (unsigned long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else static inline void ext4_mb_show_pa(struct super_block *sb) { } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); } #endif /* * We use locality group preallocation for small size file. The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && !inode_is_open_for_write(ac->ac_inode)) inode_pa_eligible = false; size = max(size, isize); /* Don't use group allocation for large files */ if (size > sbi->s_mb_stream_request) group_pa_eligible = false; if (!group_pa_eligible) { if (inode_pa_eligible) ac->ac_flags |= EXT4_MB_STREAM_ALLOC; else ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. 
*/ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. 
*/ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. * which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_node.lg_list, &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total * number of entries in the list */ } spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } if (!added) list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); } /* * release all resource we used in allocation */ static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. 
*/ if (likely(pa->pa_free)) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_folio) folio_put(ac->ac_bitmap_folio); if (ac->ac_buddy_folio) folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0, busy = 0; int retry = 0; trace_ext4_mb_discard_preallocations(sb, needed); if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; repeat: for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; cond_resched(); } if (needed > 0 && busy && ++retry < 3) { busy = 0; goto repeat; } return freed; } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, struct ext4_allocation_context *ac, u64 *seq) { int freed; u64 seq_retry = 0; bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) { ret = true; goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } out_dbg: mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } /* * Simple allocator for Ext4 fast commit replay path. It searches for blocks * linearly starting at the goal block and also excludes the blocks which * are going to be in use after fast commit replay. */ static ext4_fsblk_t ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return 0; } while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; } brelse(bitmap_bh); if (i < max) break; if (++group >= ext4_get_groups_count(sb)) group = 0; blkoff = 0; } if (i >= max) { *errp = -ENOSPC; return 0; } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; *errp = 0; return block; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; int retries = 0; u64 seq; might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); 
trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && dquot_alloc_block(ar->inode, EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); *errp = ext4_mb_pa_alloc(ac); if (*errp) goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); /* * pa allocated above is added to grp->bb_prealloc_list only * when we were able to allocate some block i.e. when * ac->ac_status == AC_STATUS_FOUND. * And error from above mean ac->ac_status != AC_STATUS_FOUND * So we have to free this pa here itself. */ if (*errp) { ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { if (++retries < 3 && ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above * needs to be freed here itself. */ ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } if (*errp) { errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); kmem_cache_free(ext4_ac_cachep, ac); out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. 
*/ static inline bool ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { if (entry1->efd_tid != entry2->efd_tid) return false; if (entry1->efd_start_cluster + entry1->efd_count != entry2->efd_start_cluster) return false; if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) return false; return true; } static inline void ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry1, struct ext4_free_data *entry2) { entry1->efd_count += entry2->efd_count; spin_lock(&sbi->s_md_lock); list_del(&entry2->efd_list); spin_unlock(&sbi->s_md_lock); rb_erase(&entry2->efd_node, root); kmem_cache_free(ext4_free_data_cachep, entry2); } static inline void ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry) { struct ext4_free_data *prev; struct rb_node *node; node = rb_prev(&entry->efd_node); if (!node) return; prev = rb_entry(node, struct ext4_free_data, efd_node); if (ext4_freed_extents_can_be_merged(prev, entry)) ext4_merge_freed_extents(sbi, root, prev, entry); } static inline void ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry) { struct ext4_free_data *next; struct rb_node *node; node = rb_next(&entry->efd_node); if (!node) return; next = rb_entry(node, struct ext4_free_data, efd_node); if (ext4_freed_extents_can_be_merged(entry, next)) ext4_merge_freed_extents(sbi, root, entry, next); } static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry = NULL; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_root *root = &db->bb_free_root; struct rb_node **n = &root->rb_node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_folio == NULL); BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. 
We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ folio_get(e4b->bd_buddy_folio); folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); return; } } atomic_add(clusters, &sbi->s_mb_free_pending); if (!entry) goto insert; /* Now try to see the extent can be merged to prev and next */ if (ext4_freed_extents_can_be_merged(new_entry, entry)) { entry->efd_start_cluster = cluster; entry->efd_count += new_entry->efd_count; kmem_cache_free(ext4_free_data_cachep, new_entry); ext4_try_merge_freed_extent_prev(sbi, root, entry); return; } if (ext4_freed_extents_can_be_merged(entry, new_entry)) { entry->efd_count += new_entry->efd_count; kmem_cache_free(ext4_free_data_cachep, new_entry); ext4_try_merge_freed_extent_next(sbi, root, entry); return; } insert: rb_link_node(new_node, parent, n); rb_insert_color(new_node, root); spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); spin_unlock(&sbi->s_md_lock); } static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { struct super_block *sb = inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); } /** * ext4_mb_clear_bb() -- helper function for freeing blocks. * Used by ext4_free_blocks() * @handle: handle for this transaction * @inode: inode * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int mark_flags = 0; ext4_grpblk_t changed; sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); grp = ext4_get_group_info(sb, block_group); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. 
*/ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_clean; } #ifdef AGGRESSIVE_CHECK mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, false, block_group, bit, count_clusters, mark_flags, &changed); if (err && changed == 0) goto error_clean; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != count_clusters); #endif /* * We need to make sure we don't reuse the freed block until after the * transaction is committed. We make an exception if the inode is to be * written in writeback mode since writeback mode has weak data * consistency guarantees. */ if (ext4_handle_valid(handle) && ((flags & EXT4_FREE_BLOCKS_METADATA) || !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters); /* * Ignore EOPNOTSUPP error. This is consistent with * what happens when using journal. */ if (err == -EOPNOTSUPP) err = 0; if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); } EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); } ext4_unlock_group(sb, block_group); /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they * can determine if a cluster freed here should be rereserved */ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); } if (overflow && !err) { block += count; count = overflow; ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @bh: optional buffer of the block to be freed * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; unsigned int overflow; struct ext4_sb_info *sbi; sbi = EXT4_SB(sb); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } if (sbi->s_mount_state & EXT4_FC_REPLAY) { ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } might_sleep(); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = 
%llu, count = %lu", block, count); return; } flags |= EXT4_FREE_BLOCKS_VALIDATED; ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { BUG_ON(count > 1); ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) bh = sb_find_get_block_nonatomic(inode->i_sb, block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } ext4_mb_clear_bb(handle, inode, block, count, flags); } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. */ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { ext4_group_t block_group; ext4_grpblk_t bit; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. 
*/ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_out; } err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; goto error_clean; } err = ext4_mb_mark_context(handle, sb, false, block_group, bit, cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, &changed); if (err && changed == 0) goto error_clean; if (changed != cluster_count) ext4_error(sb, "bit already cleared in group %u", block_group); ext4_lock_group(sb, block_group); mb_free_blocks(NULL, &e4b, bit, cluster_count); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, changed); error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); return err; } /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To ensure that no * one will allocate those blocks, mark them as used in the buddy bitmap. This must * be called under the group lock. */ static int ext4_trim_extent(struct super_block *sb, int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; /* * Mark blocks used, so no one can reuse them while * being trimmed.
*/ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { unsigned long nr_clusters_in_group; if (grp < (ext4_get_groups_count(sb) - 1)) nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); else nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, grp)) >> EXT4_CLUSTER_BITS(sb); return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) { return fatal_signal_pending(current) || freezing(current); } static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return 0; last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) set_trimmed = true; origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; next = mb_find_next_bit(bitmap, last + 1, start); if (origin_start == 0 && next >= last) set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) return count; count += next - start; } free_count += next - start; start = next + 1; if (ext4_trim_interrupted()) return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); cond_resched(); ext4_lock_group(sb, e4b->bd_group); } if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } if (set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); return count; } /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. 
*/ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; trace_ext4_trim_all_free(sb, group, start, max); ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_warning(sb, "Error %d loading buddy information for %u", ret, group); return ret; } ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); else ret = 0; ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", ret, group); return ret; } /** * ext4_trim_fs() -- trim ioctl handle function * @sb: superblock for filesystem * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * ext4_trim_fs goes through all allocation groups containing Bytes from * start to start+len. For each such a group ext4_trim_all_free function * is invoked to trim all free space. */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = EXT4_NUM_B2C(EXT4_SB(sb), range->minlen >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } if (end >= max_blks - 1) end = max_blks - 1; if (end <= first_data_blk) goto out; if (start < first_data_blk) start = first_data_blk; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { if (ext4_trim_interrupted()) break; grp = ext4_get_group_info(sb, group); if (!grp) continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } /* * For all the groups except the last one, last cluster will * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ if (group == last_group) end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, end, minlen); if (cnt < 0) { ret = cnt; break; } trimmed += cnt; } /* * For every group except the first one, we are sure * that the first cluster to discard will be cluster #0. 
*/ first_cluster = 0; } if (!ret) EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } /* Iterate all the free extents in the group. */ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, ext4_grpblk_t first, ext4_grpblk_t end, ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; error = ext4_mb_load_buddy(sb, group, &e4b); if (error) return error; bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; if (meta_formatter && start != first) { if (start > end) start = end; ext4_unlock_group(sb, group); error = meta_formatter(sb, group, first, start - first, priv); if (error) goto out_unload; ext4_lock_group(sb, group); } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); start = next + 1; } ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(&e4b); return error; } #ifdef CONFIG_EXT4_KUNIT_TESTS #include "mballoc-test.c" #endif
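/*
 * Editorial note, not part of mballoc.c: ext4_trim_fs() above is the handler
 * reached through the FITRIM ioctl. Below is a minimal userspace sketch of
 * how a tool such as fstrim(8) could invoke it; the mount point "/mnt" is
 * only a placeholder, error handling is minimal, and FITRIM requires
 * CAP_SYS_ADMIN.
 *
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>		// FITRIM and struct fstrim_range
 *
 *	int main(void)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = ULLONG_MAX,	// consider the whole filesystem
 *			.minlen = 0,		// ext4 raises this to the discard granularity
 *		};
 *		// Any open file or directory on the ext4 filesystem works here.
 *		int fd = open("/mnt", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, FITRIM, &range) != 0)
 *			return 1;
 *		// On success ext4_trim_fs() has rewritten range.len to the
 *		// number of bytes actually discarded.
 *		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
 *		return 0;
 *	}
 */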
// SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/miscdevice.h> #include <linux/interrupt.h> #include <linux/highmem.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/pci.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/io.h> #include "vmci_handle_array.h" #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_doorbell.h" #include "vmci_resource.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" #define VMCI_UTIL_NUM_RESOURCES 1 enum { VMCI_NOTIFY_RESOURCE_QUEUE_PAIR = 0, VMCI_NOTIFY_RESOURCE_DOOR_BELL = 1, }; enum { VMCI_NOTIFY_RESOURCE_ACTION_NOTIFY = 0, VMCI_NOTIFY_RESOURCE_ACTION_CREATE = 1, VMCI_NOTIFY_RESOURCE_ACTION_DESTROY = 2, }; /* * VMCI driver initialization. This block can also be used to * pass initial group membership etc. */ struct vmci_init_blk { u32 cid; u32 flags; }; /* VMCIqueue_pairAllocInfo_VMToVM */ struct vmci_qp_alloc_info_vmvm { struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u64 produce_page_file; /* User VA. */ u64 consume_page_file; /* User VA. */ u64 produce_page_file_size; /* Size of the file name array. */ u64 consume_page_file_size; /* Size of the file name array. */ s32 result; u32 _pad; }; /* VMCISetNotifyInfo: Used to pass notify flag's address to the host driver. */ struct vmci_set_notify_info { u64 notify_uva; s32 result; u32 _pad; }; /* * Per-instance host state */ struct vmci_host_dev { struct vmci_ctx *context; int user_version; enum vmci_obj_type ct_type; struct mutex lock; /* Mutex lock for vmci context access */ }; static struct vmci_ctx *host_context; static bool vmci_host_device_initialized; static atomic_t vmci_host_active_users = ATOMIC_INIT(0); /* * Determines whether the VMCI host personality is * available. Since the core functionality of the host driver is * always present, all guests could possibly use the host * personality. However, to minimize the deviation from the * pre-unified driver state of affairs, we only consider the host * device active if there is no active guest device or if there * are VMX'en with active VMCI contexts using the host device. */ bool vmci_host_code_active(void) { return vmci_host_device_initialized && (!vmci_guest_code_active() || atomic_read(&vmci_host_active_users) > 0); } int vmci_host_users(void) { return atomic_read(&vmci_host_active_users); } /* * Called on open of /dev/vmci.
*/ static int vmci_host_open(struct inode *inode, struct file *filp) { struct vmci_host_dev *vmci_host_dev; vmci_host_dev = kzalloc(sizeof(struct vmci_host_dev), GFP_KERNEL); if (vmci_host_dev == NULL) return -ENOMEM; vmci_host_dev->ct_type = VMCIOBJ_NOT_SET; mutex_init(&vmci_host_dev->lock); filp->private_data = vmci_host_dev; return 0; } /* * Called on close of /dev/vmci, most often when the process * exits. */ static int vmci_host_close(struct inode *inode, struct file *filp) { struct vmci_host_dev *vmci_host_dev = filp->private_data; if (vmci_host_dev->ct_type == VMCIOBJ_CONTEXT) { vmci_ctx_destroy(vmci_host_dev->context); vmci_host_dev->context = NULL; /* * The number of active contexts is used to track whether any * VMX'en are using the host personality. It is incremented when * a context is created through the IOCTL_VMCI_INIT_CONTEXT * ioctl. */ atomic_dec(&vmci_host_active_users); } vmci_host_dev->ct_type = VMCIOBJ_NOT_SET; kfree(vmci_host_dev); filp->private_data = NULL; return 0; } /* * This is used to wake up the VMX when a VMCI call arrives, or * to wake up select() or poll() at the next clock tick. */ static __poll_t vmci_host_poll(struct file *filp, poll_table *wait) { struct vmci_host_dev *vmci_host_dev = filp->private_data; struct vmci_ctx *context; __poll_t mask = 0; if (vmci_host_dev->ct_type == VMCIOBJ_CONTEXT) { /* * Read context only if ct_type == VMCIOBJ_CONTEXT to make * sure that context is initialized */ context = vmci_host_dev->context; /* Check for VMCI calls to this VM context. */ if (wait) poll_wait(filp, &context->host_context.wait_queue, wait); spin_lock(&context->lock); if (context->pending_datagrams > 0 || vmci_handle_arr_get_size( context->pending_doorbell_array) > 0) { mask = EPOLLIN; } spin_unlock(&context->lock); } return mask; } /* * Copies the handles of a handle array into a user buffer, and * returns the new length in userBufferSize. If the copy to the * user buffer fails, the functions still returns VMCI_SUCCESS, * but retval != 0. */ static int drv_cp_harray_to_user(void __user *user_buf_uva, u64 *user_buf_size, struct vmci_handle_arr *handle_array, int *retval) { u32 array_size = 0; struct vmci_handle *handles; if (handle_array) array_size = vmci_handle_arr_get_size(handle_array); if (array_size * sizeof(*handles) > *user_buf_size) return VMCI_ERROR_MORE_DATA; *user_buf_size = array_size * sizeof(*handles); if (*user_buf_size) *retval = copy_to_user(user_buf_uva, vmci_handle_arr_get_handles (handle_array), *user_buf_size); return VMCI_SUCCESS; } /* * Sets up a given context for notify to work. Maps the notify * boolean in user VA into kernel space. */ static int vmci_host_setup_notify(struct vmci_ctx *context, unsigned long uva) { struct page *page; int retval; if (context->notify_page) { pr_devel("%s: Notify mechanism is already set up\n", __func__); return VMCI_ERROR_DUPLICATE_ENTRY; } /* * We are using 'bool' internally, but let's make sure we explicit * about the size. */ BUILD_BUG_ON(sizeof(bool) != sizeof(u8)); /* * Lock physical page backing a given user VA. */ retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &page); if (retval != 1) return VMCI_ERROR_GENERIC; context->notify_page = page; /* * Map the locked page and set up notify pointer. 
*/ context->notify = kmap(context->notify_page) + (uva & (PAGE_SIZE - 1)); vmci_ctx_check_signal_notify(context); return VMCI_SUCCESS; } static int vmci_host_get_version(struct vmci_host_dev *vmci_host_dev, unsigned int cmd, void __user *uptr) { if (cmd == IOCTL_VMCI_VERSION2) { int __user *vptr = uptr; if (get_user(vmci_host_dev->user_version, vptr)) return -EFAULT; } /* * The basic logic here is: * * If the user sends in a version of 0 tell it our version. * If the user didn't send in a version, tell it our version. * If the user sent in an old version, tell it -its- version. * If the user sent in an newer version, tell it our version. * * The rationale behind telling the caller its version is that * Workstation 6.5 required that VMX and VMCI kernel module were * version sync'd. All new VMX users will be programmed to * handle the VMCI kernel module version. */ if (vmci_host_dev->user_version > 0 && vmci_host_dev->user_version < VMCI_VERSION_HOSTQP) { return vmci_host_dev->user_version; } return VMCI_VERSION; } #define vmci_ioctl_err(fmt, ...) \ pr_devel("%s: " fmt, ioctl_name, ##__VA_ARGS__) static int vmci_host_do_init_context(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_init_blk init_block; const struct cred *cred; int retval; if (copy_from_user(&init_block, uptr, sizeof(init_block))) { vmci_ioctl_err("error reading init block\n"); return -EFAULT; } mutex_lock(&vmci_host_dev->lock); if (vmci_host_dev->ct_type != VMCIOBJ_NOT_SET) { vmci_ioctl_err("received VMCI init on initialized handle\n"); retval = -EINVAL; goto out; } if (init_block.flags & ~VMCI_PRIVILEGE_FLAG_RESTRICTED) { vmci_ioctl_err("unsupported VMCI restriction flag\n"); retval = -EINVAL; goto out; } cred = get_current_cred(); vmci_host_dev->context = vmci_ctx_create(init_block.cid, init_block.flags, 0, vmci_host_dev->user_version, cred); put_cred(cred); if (IS_ERR(vmci_host_dev->context)) { retval = PTR_ERR(vmci_host_dev->context); vmci_ioctl_err("error initializing context\n"); goto out; } /* * Copy cid to userlevel, we do this to allow the VMX * to enforce its policy on cid generation. 
*/ init_block.cid = vmci_ctx_get_id(vmci_host_dev->context); if (copy_to_user(uptr, &init_block, sizeof(init_block))) { vmci_ctx_destroy(vmci_host_dev->context); vmci_host_dev->context = NULL; vmci_ioctl_err("error writing init block\n"); retval = -EFAULT; goto out; } vmci_host_dev->ct_type = VMCIOBJ_CONTEXT; atomic_inc(&vmci_host_active_users); vmci_call_vsock_callback(true); retval = 0; out: mutex_unlock(&vmci_host_dev->lock); return retval; } static int vmci_host_do_send_datagram(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_datagram_snd_rcv_info send_info; struct vmci_datagram *dg = NULL; u32 cid; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&send_info, uptr, sizeof(send_info))) return -EFAULT; if (send_info.len > VMCI_MAX_DG_SIZE) { vmci_ioctl_err("datagram is too big (size=%d)\n", send_info.len); return -EINVAL; } if (send_info.len < sizeof(*dg)) { vmci_ioctl_err("datagram is too small (size=%d)\n", send_info.len); return -EINVAL; } dg = memdup_user((void __user *)(uintptr_t)send_info.addr, send_info.len); if (IS_ERR(dg)) { vmci_ioctl_err( "cannot allocate memory to dispatch datagram\n"); return PTR_ERR(dg); } if (VMCI_DG_SIZE(dg) != send_info.len) { vmci_ioctl_err("datagram size mismatch\n"); kfree(dg); return -EINVAL; } pr_devel("Datagram dst (handle=0x%x:0x%x) src (handle=0x%x:0x%x), payload (size=%llu bytes)\n", dg->dst.context, dg->dst.resource, dg->src.context, dg->src.resource, (unsigned long long)dg->payload_size); /* Get source context id. */ cid = vmci_ctx_get_id(vmci_host_dev->context); send_info.result = vmci_datagram_dispatch(cid, dg, true); kfree(dg); return copy_to_user(uptr, &send_info, sizeof(send_info)) ? -EFAULT : 0; } static int vmci_host_do_receive_datagram(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_datagram_snd_rcv_info recv_info; struct vmci_datagram *dg = NULL; int retval; size_t size; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&recv_info, uptr, sizeof(recv_info))) return -EFAULT; size = recv_info.len; recv_info.result = vmci_ctx_dequeue_datagram(vmci_host_dev->context, &size, &dg); if (recv_info.result >= VMCI_SUCCESS) { void __user *ubuf = (void __user *)(uintptr_t)recv_info.addr; retval = copy_to_user(ubuf, dg, VMCI_DG_SIZE(dg)); kfree(dg); if (retval != 0) return -EFAULT; } return copy_to_user(uptr, &recv_info, sizeof(recv_info)) ? 
-EFAULT : 0; } static int vmci_host_do_alloc_queuepair(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_handle handle; int vmci_status; int __user *retptr; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { struct vmci_qp_alloc_info_vmvm alloc_info; struct vmci_qp_alloc_info_vmvm __user *info = uptr; if (copy_from_user(&alloc_info, uptr, sizeof(alloc_info))) return -EFAULT; handle = alloc_info.handle; retptr = &info->result; vmci_status = vmci_qp_broker_alloc(alloc_info.handle, alloc_info.peer, alloc_info.flags, VMCI_NO_PRIVILEGE_FLAGS, alloc_info.produce_size, alloc_info.consume_size, NULL, vmci_host_dev->context); if (vmci_status == VMCI_SUCCESS) vmci_status = VMCI_SUCCESS_QUEUEPAIR_CREATE; } else { struct vmci_qp_alloc_info alloc_info; struct vmci_qp_alloc_info __user *info = uptr; struct vmci_qp_page_store page_store; if (copy_from_user(&alloc_info, uptr, sizeof(alloc_info))) return -EFAULT; handle = alloc_info.handle; retptr = &info->result; page_store.pages = alloc_info.ppn_va; page_store.len = alloc_info.num_ppns; vmci_status = vmci_qp_broker_alloc(alloc_info.handle, alloc_info.peer, alloc_info.flags, VMCI_NO_PRIVILEGE_FLAGS, alloc_info.produce_size, alloc_info.consume_size, &page_store, vmci_host_dev->context); } if (put_user(vmci_status, retptr)) { if (vmci_status >= VMCI_SUCCESS) { vmci_status = vmci_qp_broker_detach(handle, vmci_host_dev->context); } return -EFAULT; } return 0; } static int vmci_host_do_queuepair_setva(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_set_va_info set_va_info; struct vmci_qp_set_va_info __user *info = uptr; s32 result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { vmci_ioctl_err("is not allowed\n"); return -EINVAL; } if (copy_from_user(&set_va_info, uptr, sizeof(set_va_info))) return -EFAULT; if (set_va_info.va) { /* * VMX is passing down a new VA for the queue * pair mapping. */ result = vmci_qp_broker_map(set_va_info.handle, vmci_host_dev->context, set_va_info.va); } else { /* * The queue pair is about to be unmapped by * the VMX. */ result = vmci_qp_broker_unmap(set_va_info.handle, vmci_host_dev->context, 0); } return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_queuepair_setpf(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_page_file_info page_file_info; struct vmci_qp_page_file_info __user *info = uptr; s32 result; if (vmci_host_dev->user_version < VMCI_VERSION_HOSTQP || vmci_host_dev->user_version >= VMCI_VERSION_NOVMVM) { vmci_ioctl_err("not supported on this VMX (version=%d)\n", vmci_host_dev->user_version); return -EINVAL; } if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&page_file_info, uptr, sizeof(*info))) return -EFAULT; /* * Communicate success pre-emptively to the caller. Note that the * basic premise is that it is incumbent upon the caller not to look at * the info.result field until after the ioctl() returns. And then, * only if the ioctl() result indicates no error. We send up the * SUCCESS status before calling SetPageStore() store because failing * to copy up the result code means unwinding the SetPageStore(). 
* * It turns out the logic to unwind a SetPageStore() opens a can of * worms. For example, if a host had created the queue_pair and a * guest attaches and SetPageStore() is successful but writing success * fails, then ... the host has to be stopped from writing (anymore) * data into the queue_pair. That means an additional test in the * VMCI_Enqueue() code path. Ugh. */ if (put_user(VMCI_SUCCESS, &info->result)) { /* * In this case, we can't write a result field of the * caller's info block. So, we don't even try to * SetPageStore(). */ return -EFAULT; } result = vmci_qp_broker_set_page_store(page_file_info.handle, page_file_info.produce_va, page_file_info.consume_va, vmci_host_dev->context); if (result < VMCI_SUCCESS) { if (put_user(result, &info->result)) { /* * Note that in this case the SetPageStore() * call failed but we were unable to * communicate that to the caller (because the * copy_to_user() call failed). So, if we * simply return an error (in this case * -EFAULT) then the caller will know that the * SetPageStore failed even though we couldn't * put the result code in the result field and * indicate exactly why it failed. * * That says nothing about the issue where we * were once able to write to the caller's info * memory and now can't. Something more * serious is probably going on than the fact * that SetPageStore() didn't work. */ return -EFAULT; } } return 0; } static int vmci_host_do_qp_detach(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_dtch_info detach_info; struct vmci_qp_dtch_info __user *info = uptr; s32 result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&detach_info, uptr, sizeof(detach_info))) return -EFAULT; result = vmci_qp_broker_detach(detach_info.handle, vmci_host_dev->context); if (result == VMCI_SUCCESS && vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { result = VMCI_SUCCESS_LAST_DETACH; } return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_add_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_info ar_info; struct vmci_ctx_info __user *info = uptr; s32 result; u32 cid; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&ar_info, uptr, sizeof(ar_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); result = vmci_ctx_add_notification(cid, ar_info.remote_cid); return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_remove_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_info ar_info; struct vmci_ctx_info __user *info = uptr; u32 cid; int result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&ar_info, uptr, sizeof(ar_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); result = vmci_ctx_remove_notification(cid, ar_info.remote_cid); return put_user(result, &info->result) ? 
-EFAULT : 0; } static int vmci_host_do_ctx_get_cpt_state(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_chkpt_buf_info get_info; u32 cid; void *cpt_buf; int retval; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&get_info, uptr, sizeof(get_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); get_info.result = vmci_ctx_get_chkpt_state(cid, get_info.cpt_type, &get_info.buf_size, &cpt_buf); if (get_info.result == VMCI_SUCCESS && get_info.buf_size) { void __user *ubuf = (void __user *)(uintptr_t)get_info.cpt_buf; retval = copy_to_user(ubuf, cpt_buf, get_info.buf_size); kfree(cpt_buf); if (retval) return -EFAULT; } return copy_to_user(uptr, &get_info, sizeof(get_info)) ? -EFAULT : 0; } static int vmci_host_do_ctx_set_cpt_state(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_chkpt_buf_info set_info; u32 cid; void *cpt_buf; int retval; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&set_info, uptr, sizeof(set_info))) return -EFAULT; cpt_buf = memdup_user((void __user *)(uintptr_t)set_info.cpt_buf, set_info.buf_size); if (IS_ERR(cpt_buf)) return PTR_ERR(cpt_buf); cid = vmci_ctx_get_id(vmci_host_dev->context); set_info.result = vmci_ctx_set_chkpt_state(cid, set_info.cpt_type, set_info.buf_size, cpt_buf); retval = copy_to_user(uptr, &set_info, sizeof(set_info)) ? -EFAULT : 0; kfree(cpt_buf); return retval; } static int vmci_host_do_get_context_id(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { u32 __user *u32ptr = uptr; return put_user(VMCI_HOST_CONTEXT_ID, u32ptr) ? -EFAULT : 0; } static int vmci_host_do_set_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_set_notify_info notify_info; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(¬ify_info, uptr, sizeof(notify_info))) return -EFAULT; if (notify_info.notify_uva) { notify_info.result = vmci_host_setup_notify(vmci_host_dev->context, notify_info.notify_uva); } else { vmci_ctx_unset_notify(vmci_host_dev->context); notify_info.result = VMCI_SUCCESS; } return copy_to_user(uptr, ¬ify_info, sizeof(notify_info)) ? 
-EFAULT : 0; } static int vmci_host_do_notify_resource(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_dbell_notify_resource_info info; u32 cid; if (vmci_host_dev->user_version < VMCI_VERSION_NOTIFY) { vmci_ioctl_err("invalid for current VMX versions\n"); return -EINVAL; } if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&info, uptr, sizeof(info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); switch (info.action) { case VMCI_NOTIFY_RESOURCE_ACTION_NOTIFY: if (info.resource == VMCI_NOTIFY_RESOURCE_DOOR_BELL) { u32 flags = VMCI_NO_PRIVILEGE_FLAGS; info.result = vmci_ctx_notify_dbell(cid, info.handle, flags); } else { info.result = VMCI_ERROR_UNAVAILABLE; } break; case VMCI_NOTIFY_RESOURCE_ACTION_CREATE: info.result = vmci_ctx_dbell_create(cid, info.handle); break; case VMCI_NOTIFY_RESOURCE_ACTION_DESTROY: info.result = vmci_ctx_dbell_destroy(cid, info.handle); break; default: vmci_ioctl_err("got unknown action (action=%d)\n", info.action); info.result = VMCI_ERROR_INVALID_ARGS; } return copy_to_user(uptr, &info, sizeof(info)) ? -EFAULT : 0; } static int vmci_host_do_recv_notifications(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_notify_recv_info info; struct vmci_handle_arr *db_handle_array; struct vmci_handle_arr *qp_handle_array; void __user *ubuf; u32 cid; int retval = 0; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOTIFY) { vmci_ioctl_err("not supported for the current vmx version\n"); return -EINVAL; } if (copy_from_user(&info, uptr, sizeof(info))) return -EFAULT; if ((info.db_handle_buf_size && !info.db_handle_buf_uva) || (info.qp_handle_buf_size && !info.qp_handle_buf_uva)) { return -EINVAL; } cid = vmci_ctx_get_id(vmci_host_dev->context); info.result = vmci_ctx_rcv_notifications_get(cid, &db_handle_array, &qp_handle_array); if (info.result != VMCI_SUCCESS) return copy_to_user(uptr, &info, sizeof(info)) ? 
-EFAULT : 0; ubuf = (void __user *)(uintptr_t)info.db_handle_buf_uva; info.result = drv_cp_harray_to_user(ubuf, &info.db_handle_buf_size, db_handle_array, &retval); if (info.result == VMCI_SUCCESS && !retval) { ubuf = (void __user *)(uintptr_t)info.qp_handle_buf_uva; info.result = drv_cp_harray_to_user(ubuf, &info.qp_handle_buf_size, qp_handle_array, &retval); } if (!retval && copy_to_user(uptr, &info, sizeof(info))) retval = -EFAULT; vmci_ctx_rcv_notifications_release(cid, db_handle_array, qp_handle_array, info.result == VMCI_SUCCESS && !retval); return retval; } static long vmci_host_unlocked_ioctl(struct file *filp, unsigned int iocmd, unsigned long ioarg) { #define VMCI_DO_IOCTL(ioctl_name, ioctl_fn) do { \ char *name = "IOCTL_VMCI_" # ioctl_name; \ return vmci_host_do_ ## ioctl_fn( \ vmci_host_dev, name, uptr); \ } while (0) struct vmci_host_dev *vmci_host_dev = filp->private_data; void __user *uptr = (void __user *)ioarg; switch (iocmd) { case IOCTL_VMCI_INIT_CONTEXT: VMCI_DO_IOCTL(INIT_CONTEXT, init_context); case IOCTL_VMCI_DATAGRAM_SEND: VMCI_DO_IOCTL(DATAGRAM_SEND, send_datagram); case IOCTL_VMCI_DATAGRAM_RECEIVE: VMCI_DO_IOCTL(DATAGRAM_RECEIVE, receive_datagram); case IOCTL_VMCI_QUEUEPAIR_ALLOC: VMCI_DO_IOCTL(QUEUEPAIR_ALLOC, alloc_queuepair); case IOCTL_VMCI_QUEUEPAIR_SETVA: VMCI_DO_IOCTL(QUEUEPAIR_SETVA, queuepair_setva); case IOCTL_VMCI_QUEUEPAIR_SETPAGEFILE: VMCI_DO_IOCTL(QUEUEPAIR_SETPAGEFILE, queuepair_setpf); case IOCTL_VMCI_QUEUEPAIR_DETACH: VMCI_DO_IOCTL(QUEUEPAIR_DETACH, qp_detach); case IOCTL_VMCI_CTX_ADD_NOTIFICATION: VMCI_DO_IOCTL(CTX_ADD_NOTIFICATION, ctx_add_notify); case IOCTL_VMCI_CTX_REMOVE_NOTIFICATION: VMCI_DO_IOCTL(CTX_REMOVE_NOTIFICATION, ctx_remove_notify); case IOCTL_VMCI_CTX_GET_CPT_STATE: VMCI_DO_IOCTL(CTX_GET_CPT_STATE, ctx_get_cpt_state); case IOCTL_VMCI_CTX_SET_CPT_STATE: VMCI_DO_IOCTL(CTX_SET_CPT_STATE, ctx_set_cpt_state); case IOCTL_VMCI_GET_CONTEXT_ID: VMCI_DO_IOCTL(GET_CONTEXT_ID, get_context_id); case IOCTL_VMCI_SET_NOTIFY: VMCI_DO_IOCTL(SET_NOTIFY, set_notify); case IOCTL_VMCI_NOTIFY_RESOURCE: VMCI_DO_IOCTL(NOTIFY_RESOURCE, notify_resource); case IOCTL_VMCI_NOTIFICATIONS_RECEIVE: VMCI_DO_IOCTL(NOTIFICATIONS_RECEIVE, recv_notifications); case IOCTL_VMCI_VERSION: case IOCTL_VMCI_VERSION2: return vmci_host_get_version(vmci_host_dev, iocmd, uptr); default: pr_devel("%s: Unknown ioctl (iocmd=%d)\n", __func__, iocmd); return -EINVAL; } #undef VMCI_DO_IOCTL } static const struct file_operations vmuser_fops = { .owner = THIS_MODULE, .open = vmci_host_open, .release = vmci_host_close, .poll = vmci_host_poll, .unlocked_ioctl = vmci_host_unlocked_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static struct miscdevice vmci_host_miscdev = { .name = "vmci", .minor = MISC_DYNAMIC_MINOR, .fops = &vmuser_fops, }; int __init vmci_host_init(void) { int error; host_context = vmci_ctx_create(VMCI_HOST_CONTEXT_ID, VMCI_DEFAULT_PROC_PRIVILEGE_FLAGS, -1, VMCI_VERSION, NULL); if (IS_ERR(host_context)) { error = PTR_ERR(host_context); pr_warn("Failed to initialize VMCIContext (error%d)\n", error); return error; } error = misc_register(&vmci_host_miscdev); if (error) { pr_warn("Module registration error (name=%s, major=%d, minor=%d, err=%d)\n", vmci_host_miscdev.name, MISC_MAJOR, vmci_host_miscdev.minor, error); pr_warn("Unable to initialize host personality\n"); vmci_ctx_destroy(host_context); return error; } pr_info("VMCI host device registered (name=%s, major=%d, minor=%d)\n", vmci_host_miscdev.name, MISC_MAJOR, vmci_host_miscdev.minor); vmci_host_device_initialized = 
true;

	return 0;
}

void __exit vmci_host_exit(void)
{
	vmci_host_device_initialized = false;

	misc_deregister(&vmci_host_miscdev);
	vmci_ctx_destroy(host_context);
	vmci_qp_broker_exit();

	pr_debug("VMCI host driver module unloaded\n");
}
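/*
 * Editor's sketch (not part of the driver): the userspace side of the
 * handshake that the handlers above implement.  A VMX-like client is
 * expected to open /dev/vmci, negotiate a version through
 * IOCTL_VMCI_VERSION2, and only then create its context with
 * IOCTL_VMCI_INIT_CONTEXT; on success the driver writes the granted
 * context id back into the init block.  The helper name is hypothetical,
 * the kernel u32 typedef is reused for brevity, and the availability of
 * the IOCTL_VMCI_* request codes, VMCI_VERSION and struct vmci_init_blk
 * to userspace (e.g. via the VMCI definitions header shipped with the
 * VMX) is assumed rather than guaranteed.
 */
#if 0	/* illustrative userspace sketch, never compiled with this file */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int vmci_example_open_context(u32 cid)
{
	struct vmci_init_blk init_block = {
		.cid	= cid,	/* context id this client wants to register */
		.flags	= 0,	/* i.e. not VMCI_PRIVILEGE_FLAG_RESTRICTED */
	};
	int version = VMCI_VERSION;
	int fd, granted;

	fd = open("/dev/vmci", O_RDWR);
	if (fd < 0)
		return -errno;

	/*
	 * Returns the version the host will speak; see
	 * vmci_host_get_version() above for the negotiation rules.
	 */
	granted = ioctl(fd, IOCTL_VMCI_VERSION2, &version);
	if (granted < 0 ||
	    ioctl(fd, IOCTL_VMCI_INIT_CONTEXT, &init_block) < 0) {
		int err = -errno;

		close(fd);
		return err;
	}

	/* Keep the fd open: releasing it destroys the context. */
	return fd;
}
#endif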
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/iversion.h>
#include <linux/filelock.h>
#include <linux/jiffies.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "crypto.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>

/*
* Capability management * * The Ceph metadata servers control client access to inode metadata * and file data by issuing capabilities, granting clients permission * to read and/or write both inode field and file data to OSDs * (storage nodes). Each capability consists of a set of bits * indicating which operations are allowed. * * If the client holds a *_SHARED cap, the client has a coherent value * that can be safely read from the cached inode. * * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the * client is allowed to change inode attributes (e.g., file size, * mtime), note its dirty state in the ceph_cap, and asynchronously * flush that metadata change to the MDS. * * In the event of a conflicting operation (perhaps by another * client), the MDS will revoke the conflicting client capabilities. * * In order for a client to cache an inode, it must hold a capability * with at least one MDS server. When inodes are released, release * notifications are batched and periodically sent en masse to the MDS * cluster to release server state. */ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. */ #define MAX_CAP_STR 20 static char cap_str[MAX_CAP_STR][40]; static DEFINE_SPINLOCK(cap_str_lock); static int last_cap_str; static char *gcap_string(char *s, int c) { if (c & CEPH_CAP_GSHARED) *s++ = 's'; if (c & CEPH_CAP_GEXCL) *s++ = 'x'; if (c & CEPH_CAP_GCACHE) *s++ = 'c'; if (c & CEPH_CAP_GRD) *s++ = 'r'; if (c & CEPH_CAP_GWR) *s++ = 'w'; if (c & CEPH_CAP_GBUFFER) *s++ = 'b'; if (c & CEPH_CAP_GWREXTEND) *s++ = 'a'; if (c & CEPH_CAP_GLAZYIO) *s++ = 'l'; return s; } const char *ceph_cap_string(int caps) { int i; char *s; int c; spin_lock(&cap_str_lock); i = last_cap_str++; if (last_cap_str == MAX_CAP_STR) last_cap_str = 0; spin_unlock(&cap_str_lock); s = cap_str[i]; if (caps & CEPH_CAP_PIN) *s++ = 'p'; c = (caps >> CEPH_CAP_SAUTH) & 3; if (c) { *s++ = 'A'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SLINK) & 3; if (c) { *s++ = 'L'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SXATTR) & 3; if (c) { *s++ = 'X'; s = gcap_string(s, c); } c = caps >> CEPH_CAP_SFILE; if (c) { *s++ = 'F'; s = gcap_string(s, c); } if (s == cap_str[i]) *s++ = '-'; *s = 0; return cap_str[i]; } void ceph_caps_init(struct ceph_mds_client *mdsc) { INIT_LIST_HEAD(&mdsc->caps_list); spin_lock_init(&mdsc->caps_list_lock); } void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; spin_lock(&mdsc->caps_list_lock); while (!list_empty(&mdsc->caps_list)) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } mdsc->caps_total_count = 0; mdsc->caps_avail_count = 0; mdsc->caps_use_count = 0; mdsc->caps_reserve_count = 0; mdsc->caps_min_count = 0; spin_unlock(&mdsc->caps_list_lock); } void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_min_count = fsopt->max_readdir; if (mdsc->caps_min_count < 1024) mdsc->caps_min_count = 1024; mdsc->caps_use_max = fsopt->caps_max; if (mdsc->caps_use_max > 0 && mdsc->caps_use_max < mdsc->caps_min_count) mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) { struct ceph_cap *cap; 
int i; if (nr_caps) { BUG_ON(mdsc->caps_reserve_count < nr_caps); mdsc->caps_reserve_count -= nr_caps; if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count -= nr_caps; for (i = 0; i < nr_caps; i++) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } } else { mdsc->caps_avail_count += nr_caps; } doutc(mdsc->fsc->client, "caps %d = %d used + %d resv + %d avail\n", mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } } /* * Called under mdsc->mutex. */ int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { struct ceph_client *cl = mdsc->fsc->client; int i, j; struct ceph_cap *cap; int have; int alloc = 0; int max_caps; int err = 0; bool trimmed = false; struct ceph_mds_session *s; LIST_HEAD(newcaps); doutc(cl, "ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count >= need) have = need; else have = mdsc->caps_avail_count; mdsc->caps_avail_count -= have; mdsc->caps_reserve_count += have; BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; ) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { list_add(&cap->caps_item, &newcaps); alloc++; i++; continue; } if (!trimmed) { for (j = 0; j < mdsc->max_sessions; j++) { s = __ceph_lookup_mds_session(mdsc, j); if (!s) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); max_caps = s->s_nr_caps - (need - i); ceph_trim_caps(mdsc, s, max_caps); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } trimmed = true; spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { int more_have; if (mdsc->caps_avail_count >= need - i) more_have = need - i; else more_have = mdsc->caps_avail_count; i += more_have; have += more_have; mdsc->caps_avail_count -= more_have; mdsc->caps_reserve_count += more_have; } spin_unlock(&mdsc->caps_list_lock); continue; } pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need, have + alloc); err = -ENOMEM; break; } if (!err) { BUG_ON(have + alloc != need); ctx->count = need; ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; mdsc->caps_reserve_count += alloc; list_splice(&newcaps, &mdsc->caps_list); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); if (err) __ceph_unreserve_caps(mdsc, have + alloc); spin_unlock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); return err; } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; bool reclaim = false; if (!ctx->count) return; doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; if (mdsc->caps_use_max > 0 && mdsc->caps_use_count > mdsc->caps_use_max) reclaim = true; spin_unlock(&mdsc->caps_list_lock); if (reclaim) ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client 
*mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; spin_unlock(&mdsc->caps_list_lock); } else { spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { BUG_ON(list_empty(&mdsc->caps_list)); mdsc->caps_avail_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } spin_unlock(&mdsc->caps_list_lock); } return cap; } spin_lock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); BUG_ON(ctx->count > mdsc->caps_reserve_count); BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); return cap; } void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { struct ceph_client *cl = mdsc->fsc->client; spin_lock(&mdsc->caps_list_lock); doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. */ if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { mdsc->caps_avail_count++; list_add(&cap->caps_item, &mdsc->caps_list); } BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); } void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { struct ceph_mds_client *mdsc = fsc->mdsc; spin_lock(&mdsc->caps_list_lock); if (total) *total = mdsc->caps_total_count; if (avail) *avail = mdsc->caps_avail_count; if (used) *used = mdsc->caps_use_count; if (reserved) *reserved = mdsc->caps_reserve_count; if (min) *min = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } /* * Find ceph_cap for given mds, if any. * * Called with i_ceph_lock held. */ struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; while (n) { cap = rb_entry(n, struct ceph_cap, ci_node); if (mds < cap->mds) n = n->rb_left; else if (mds > cap->mds) n = n->rb_right; else return cap; } return NULL; } struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); spin_unlock(&ci->i_ceph_lock); return cap; } /* * Called under i_ceph_lock. 
*/ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) { struct rb_node **p = &ci->i_caps.rb_node; struct rb_node *parent = NULL; struct ceph_cap *cap = NULL; while (*p) { parent = *p; cap = rb_entry(parent, struct ceph_cap, ci_node); if (new->mds < cap->mds) p = &(*p)->rb_left; else if (new->mds > cap->mds) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->ci_node, parent, p); rb_insert_color(&new->ci_node, &ci->i_caps); } /* * (re)set cap hold timeouts, which control the delayed release * of unused caps back to the MDS. Should be called on cap use. */ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode, ceph_vinop(inode), ci->i_hold_caps_max - jiffies); } /* * (Re)queue cap at the end of the delayed cap release list. * * If I_FLUSH is set, leave the inode at the front of the list. * * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n", inode, ceph_vinop(inode), ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); if (!list_empty(&ci->i_cap_delay_list)) { if (ci->i_ceph_flags & CEPH_I_FLUSH) goto no_change; list_del_init(&ci->i_cap_delay_list); } __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); } } /* * Queue an inode for immediate writeback. Mark inode with I_FLUSH, * indicating we should send a cap message to flush dirty metadata * asap, and move to the front of the delayed cap list. */ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* * Cancel delayed work on cap. * * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); list_del_init(&ci->i_cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* Common issue checks for add_cap, handle_cap_grant. */ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); unsigned had = __ceph_caps_issued(ci, NULL); lockdep_assert_held(&ci->i_ceph_lock); /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; } /* * If FILE_SHARED is newly issued, mark dir not complete. We don't * know what happened to this directory while we didn't have the cap. 
* If FILE_SHARED is being revoked, also mark dir not complete. It * stops on-going cached readdir. */ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); if (S_ISDIR(ci->netfs.inode.i_mode)) { doutc(cl, " marking %p NOT complete\n", inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); } } /** * change_auth_cap_ses - move inode to appropriate lists when auth caps change * @ci: inode to be moved * @session: new auth caps session */ void change_auth_cap_ses(struct ceph_inode_info *ci, struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) return; spin_lock(&session->s_mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) list_move(&ci->i_dirty_item, &session->s_cap_dirty); if (!list_empty(&ci->i_flushing_item)) list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); spin_unlock(&session->s_mdsc->cap_dirty_lock); } /* * Add a capability under the given MDS session. * * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; u32 gen; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode, ceph_vinop(inode), session->s_mds, cap_id, ceph_cap_string(issued), seq); gen = atomic_read(&session->s_cap_gen); cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); } else { spin_lock(&session->s_cap_lock); list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export * message, but still haven't received the cap import message. * handle_cap_export() updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing * a message that was send before the cap import message. So * don't remove caps. 
*/ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != cap_id); seq = cap->seq; mseq = cap->mseq; issued |= cap->issued; flags |= CEPH_CAP_FLAG_AUTH; } } if (!ci->i_snap_realm || ((flags & CEPH_CAP_FLAG_AUTH) && realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { /* * add this inode to the appropriate snap realm */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) ceph_change_snap_realm(inode, realm); else WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", __func__, realmino, ci->i_vino.ino, ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); /* * If we are issued caps we don't want, or the mds' wanted * value appears to be off, queue a check so we'll release * later and/or update the mds wanted value. */ actual_wanted = __ceph_caps_wanted(ci); if ((wanted & ~actual_wanted) || (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { if (ci->i_auth_cap && ci->i_auth_cap->session != cap->session) change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; cap->mds_wanted = wanted; } } else { WARN_ON(ci->i_auth_cap == cap); } doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), ceph_cap_string(issued|cap->issued), seq, mds); cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; wake_up_all(&ci->i_cap_wq); } /* * Return true if cap has not timed out and belongs to the current * generation of the MDS session (i.e. has not gone 'stale' due to * us losing touch with the mds). */ static int __cap_is_valid(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_client *cl = cap->session->s_mdsc->fsc->client; unsigned long ttl; u32 gen; gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } return 1; } /* * Return set of valid cap bits issued to us. Note that caps time * out, and may be invalidated in bulk if the client session times out * and session->s_cap_gen is bumped. */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; if (implemented) *implemented = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; } /* * exclude caps issued by non-auth MDS, but are been revoking * by the auth MDS. The non-auth MDS should be revoking/exporting * these caps, but the message is delayed. 
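 *
 * Editor's illustration: the masking on the auth cap below is pure bit
 * algebra on "issued" (what the MDS wants us to hold now) versus
 * "implemented" (what we were granted and still hold); their difference is
 * the "revoking" set.  A minimal self-contained sketch, using hypothetical
 * single-bit values rather than the real CEPH_CAP_* masks:
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>

int main(void)
{
	/* Hypothetical bits: 0x1 = "read", 0x2 = "cache". */
	unsigned issued      = 0x1;	/* what should be held now            */
	unsigned implemented = 0x3;	/* what was granted and is still held */
	unsigned revoking    = implemented & ~issued;	/* being taken back */

	/*
	 * have &= ~implemented | issued clears exactly the revoking bits
	 * and leaves every other bit alone.
	 */
	unsigned have = 0x3;
	have &= ~implemented | issued;

	printf("revoking=0x%x have=0x%x\n", revoking, have);	/* 0x2 0x1 */
	return 0;
}
#endif
/*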
*/ if (ci->i_auth_cap) { cap = ci->i_auth_cap; have &= ~cap->implemented | cap->issued; } return have; } /* * Get cap bits issued by caps other than @ocap */ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) { int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap == ocap) continue; if (!__cap_is_valid(cap)) continue; have |= cap->issued; } return have; } /* * Move a cap to the end of the LRU (oldest caps at list head, newest * at list tail). */ static void __touch_cap(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_mds_session *s = cap->session; struct ceph_client *cl = s->s_mdsc->fsc->client; spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode, ceph_vinop(inode), cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n", inode, ceph_vinop(inode), cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } /* * Check if we hold the given mask. If so, move the cap(s) to the * front of their respective LRUs. (This is the preferred way for * callers to check for caps they want.) */ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct rb_node *p; int have = ci->i_snap_caps; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) __touch_cap(cap); return 1; } /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { struct rb_node *q; /* touch this + preceding caps */ __touch_cap(cap); for (q = rb_first(&ci->i_caps); q != p; q = rb_next(q)) { cap = rb_entry(q, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if (cap->issued & mask) __touch_cap(cap); } } return 1; } } return 0; } int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); if (r) ceph_update_cap_hit(&fsc->mdsc->metric); else ceph_update_cap_mis(&fsc->mdsc->metric); return r; } /* * Return true if mask caps are currently being revoked by an MDS. 
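 *
 * Editor's illustration: __ceph_caps_issued_mask() above accepts a request
 * either when a single valid cap covers the whole mask or when the union of
 * several caps (plus the snap caps we already hold) does.  A minimal
 * self-contained sketch of that decision; the bit values and the
 * issued_mask() helper below are hypothetical stand-ins.
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

static bool issued_mask(const unsigned *issued, size_t ncaps,
			unsigned snap_caps, unsigned mask)
{
	unsigned have = snap_caps;
	size_t i;

	if ((have & mask) == mask)
		return true;
	for (i = 0; i < ncaps; i++) {
		if ((issued[i] & mask) == mask)
			return true;	/* one cap is enough */
		have |= issued[i];
		if ((have & mask) == mask)
			return true;	/* a combination is enough */
	}
	return false;
}

int main(void)
{
	/* Hypothetical bits: 0x1 and 0x2 issued by two different MDSes. */
	unsigned two_mds[] = { 0x1, 0x2 };

	assert(issued_mask(two_mds, 2, 0, 0x3));	/* union satisfies the mask */
	assert(!issued_mask(two_mds, 1, 0, 0x3));	/* a single cap does not    */
	return 0;
}
#endif
/*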
*/ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask) { struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } return 0; } int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; if (ci->i_pin_ref) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || (S_ISREG(ci->netfs.inode.i_mode) && ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; if (ci->i_fx_ref) used |= CEPH_CAP_FILE_EXCL; return used; } #define FMODE_WAIT_BIAS 1000 /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ if (ci->i_nr_by_mode[RD_SHIFT] > 0 || time_after(ci->i_last_rd, used_cutoff)) want |= CEPH_CAP_ANY_SHARED; if (ci->i_nr_by_mode[WR_SHIFT] > 0 || time_after(ci->i_last_wr, used_cutoff)) { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) want |= CEPH_CAP_ANY_DIR_OPS; } if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) want |= CEPH_CAP_PIN; return want; } else { int bits = 0; if (ci->i_nr_by_mode[RD_SHIFT] > 0) { if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_rd, used_cutoff)) bits |= 1 << RD_SHIFT; } else if (time_after(ci->i_last_rd, idle_cutoff)) { bits |= 1 << RD_SHIFT; } if (ci->i_nr_by_mode[WR_SHIFT] > 0) { if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_wr, used_cutoff)) bits |= 1 << WR_SHIFT; } else if (time_after(ci->i_last_wr, idle_cutoff)) { bits |= 1 << WR_SHIFT; } /* check lazyio only when read/write is wanted */ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && ci->i_nr_by_mode[LAZY_SHIFT] > 0) bits |= 1 << LAZY_SHIFT; return bits ? ceph_caps_for_mode(bits >> 1) : 0; } } /* * wanted, by virtue of open file modes AND cap refs (buffered/cached data) */ int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; } else { /* we want EXCL if dirty data */ if (w & CEPH_CAP_FILE_BUFFER) w |= CEPH_CAP_FILE_EXCL; } return w; } /* * Return caps we have registered with the MDS(s) as 'wanted'. 
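 *
 * Editor's illustration: the used/idle cutoffs in __ceph_caps_file_wanted()
 * above rely on the kernel's wraparound-safe jiffies comparison, where
 * time_after(a, b) evaluates the signed difference instead of comparing the
 * raw counters.  A minimal self-contained sketch of the usual formulation
 * of that idiom (model_time_after() is an editor's stand-in):
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdbool.h>

static bool model_time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* true when a is later than b */
}

int main(void)
{
	unsigned long now = 0x10UL;		/* just after the counter wrapped */
	unsigned long cutoff = ~0UL - 0x10UL;	/* just before the wrap           */

	assert(model_time_after(now, cutoff));	/* "now" is later despite the wrap */
	assert(!model_time_after(cutoff, now));
	return 0;
}
#endif
/*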
*/ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; int mds_wanted = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; else mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; } /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc; int removed = 0; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(cl, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode)); mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); if (ci->i_auth_cap == cap) ci->i_auth_cap = NULL; /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ doutc(cl, "delaying %p removal from session %p\n", cap, cap->session); } else { list_del_init(&cap->session_caps); session->s_nr_caps--; atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; /* * s_cap_reconnect is protected by s_cap_lock. no one changes * s_cap_gen while session is in the reconnect state. */ if (queue_release && (!session->s_cap_reconnect || cap->cap_gen == atomic_read(&session->s_cap_gen))) { cap->queue_release = 1; if (removed) { __ceph_queue_cap_release(session, cap); removed = 0; } } else { cap->queue_release = 0; } cap->cap_ino = ci->i_vino.ino; spin_unlock(&session->s_cap_lock); if (removed) ceph_put_cap(mdsc, cap); if (!__ceph_is_any_real_caps(ci)) { /* when reconnect denied, we remove session caps forcibly, * i_wr_ref can be non-zero. If there are ongoing write, * keep i_snap_realm. 
*/ if (ci->i_wr_ref == 0 && ci->i_snap_realm) ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } } void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, bool queue_release) { struct ceph_inode_info *ci = cap->ci; struct ceph_fs_client *fsc; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(mdsc->fsc->client, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); fsc = ceph_inode_to_fs_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; u64 change_attr; struct ceph_buffer *xattr_buf; struct ceph_buffer *old_xattr_buf; struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; kuid_t uid; kgid_t gid; umode_t mode; bool inline_data; bool wake; bool encrypted; u32 fscrypt_auth_len; u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; void *p; struct ceph_mds_client *mdsc = arg->session->s_mdsc; struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; doutc(mdsc->fsc->client, "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u" " tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), arg->cid, arg->ino, ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; memset(fc, 0, sizeof(*fc)); fc->cap_id = cpu_to_le64(arg->cid); fc->op = cpu_to_le32(arg->op); fc->seq = cpu_to_le32(arg->seq); fc->issue_seq = cpu_to_le32(arg->issue_seq); fc->migrate_seq = cpu_to_le32(arg->mseq); fc->caps = cpu_to_le32(arg->caps); fc->wanted = cpu_to_le32(arg->wanted); fc->dirty = cpu_to_le32(arg->dirty); fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (arg->encrypted) fc->size = cpu_to_le64(round_up(arg->size, CEPH_FSCRYPT_BLOCK_SIZE)); else #endif fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); ceph_encode_timespec64(&fc->ctime, &arg->ctime); fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); fc->mode = cpu_to_le32(arg->mode); fc->xattr_version = cpu_to_le64(arg->xattr_version); if (arg->xattr_buf) { msg->middle = ceph_buffer_get(arg->xattr_buf); fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); } p = fc + 1; /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); /* inline version (version 4) */ ceph_encode_64(&p, arg->inline_data ? 
0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); /* * osd_epoch_barrier (version 5) * The epoch_barrier is protected osdc->lock, so READ_ONCE here in * case it was recently changed */ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); /* * caller_uid/caller_gid (version 7) * * Currently, we don't properly track which caller dirtied the caps * last, and force a flush of them when there is a conflict. For now, * just set this to 0:0, to emulate how the MDS has worked up to now. */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); /* pool namespace (version 8) (mds always ignores this) */ ceph_encode_32(&p, 0); /* btime and change_attr (version 9) */ ceph_encode_timespec64(p, &arg->btime); p += sizeof(struct ceph_timespec); ceph_encode_64(&p, arg->change_attr); /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); /* dirstats (version 11) - these are r/o on the client */ ceph_encode_64(&p, 0); ceph_encode_64(&p, 0); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) /* * fscrypt_auth and fscrypt_file (version 12) * * fscrypt_auth holds the crypto context (if any). fscrypt_file * tracks the real i_size as an __le64 field (and we use a rounded-up * i_size in the traditional size field). */ ceph_encode_32(&p, arg->fscrypt_auth_len); ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); ceph_encode_32(&p, sizeof(__le64)); ceph_encode_64(&p, arg->size); #else /* CONFIG_FS_ENCRYPTION */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); #endif /* CONFIG_FS_ENCRYPTION */ } /* * Queue cap releases when an inode is dropped from our cache. */ void __ceph_remove_caps(struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct rb_node *p; /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) * may call __ceph_caps_issued_mask() on a freeing inode. */ spin_lock(&ci->i_ceph_lock); p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); ceph_remove_cap(mdsc, cap, true); } spin_unlock(&ci->i_ceph_lock); } /* * Prepare to send a cap message to an MDS. Update the cap state, and populate * the arg struct with the parameters that will need to be sent. This should * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. */ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int op, int flags, int used, int want, int retain, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n", inode, ceph_vinop(inode), cap, cap->session, ceph_cap_string(held), ceph_cap_string(held & retain), ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ /* * Wake up any waiters on wanted -> needed transition. This is due to * the weird transition from buffered to sync IO... we need to flush * dirty pages _before_ allowing sync writes to avoid reordering. 
*/ arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; arg->session = cap->session; arg->ino = ceph_vino(inode).ino; arg->cid = cap->cap_id; arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { if (want & CEPH_CAP_ANY_FILE_WR) ci->i_requested_max_size = arg->max_size; else ci->i_requested_max_size = 0; } if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; } arg->mtime = inode_get_mtime(inode); arg->atime = inode_get_atime(inode); arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); arg->op = op; arg->caps = cap->implemented; arg->wanted = want; arg->dirty = flushing; arg->seq = cap->seq; arg->issue_seq = cap->issue_seq; arg->mseq = cap->mseq; arg->time_warp_seq = ci->i_time_warp_seq; arg->uid = inode->i_uid; arg->gid = inode->i_gid; arg->mode = inode->i_mode; arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && !list_empty(&ci->i_cap_snaps)) { struct ceph_cap_snap *capsnap; list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->cap_flush.tid) break; if (capsnap->need_flush) { flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; break; } } } arg->flags = flags; arg->encrypted = IS_ENCRYPTED(inode); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len && WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { /* Don't set this if it's too big */ arg->fscrypt_auth_len = 0; } else { arg->fscrypt_auth_len = ci->fscrypt_auth_len; memcpy(arg->fscrypt_auth, ci->fscrypt_auth, min_t(size_t, ci->fscrypt_auth_len, sizeof(arg->fscrypt_auth))); } #endif /* CONFIG_FS_ENCRYPTION */ } #if IS_ENABLED(CONFIG_FS_ENCRYPTION) #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; } #else #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS; } #endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given inode. * * Caller should hold snap_rwsem (read), s_mutex. 
*/ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, false); if (!msg) { pr_err_client(cl, "error allocating cap msg: ino (%llx.%llx)" " flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); return; } encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); ceph_buffer_put(arg->xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } static inline int __send_flush_snap(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap_snap *capsnap, u32 mseq, u64 oldest_flush_tid) { struct cap_msg_args arg; struct ceph_msg *msg; arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; arg.follows = capsnap->follows; arg.flush_tid = capsnap->cap_flush.tid; arg.oldest_flush_tid = oldest_flush_tid; arg.size = capsnap->size; arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; arg.old_xattr_buf = NULL; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; arg.ctime = capsnap->ctime; arg.btime = capsnap->btime; arg.change_attr = capsnap->change_attr; arg.op = CEPH_CAP_OP_FLUSHSNAP; arg.caps = capsnap->issued; arg.wanted = 0; arg.dirty = capsnap->dirty; arg.seq = 0; arg.issue_seq = 0; arg.mseq = mseq; arg.time_warp_seq = capsnap->time_warp_seq; arg.uid = capsnap->uid; arg.gid = capsnap->gid; arg.mode = capsnap->mode; arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; arg.encrypted = IS_ENCRYPTED(inode); /* No fscrypt_auth changes from a capsnap.*/ arg.fscrypt_auth_len = 0; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), GFP_NOFS, false); if (!msg) return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); return 0; } /* * When a snapshot is taken, clients accumulate dirty metadata on * inodes with capabilities in ceph_cap_snaps to describe the file * state at the time the snapshot was taken. This must be flushed * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * * Called under i_ceph_lock. */ static void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; u64 first_tid = 1, last_tid = 0; doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode), session); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { /* * we need to wait for sync writes to complete and for dirty * pages to be written out. 
*/ if (capsnap->dirty_pages || capsnap->writing) break; /* should be removed by ceph_try_drop_cap_snap() */ BUG_ON(!capsnap->need_flush); /* only flush each capsnap once */ if (capsnap->cap_flush.tid > 0) { doutc(cl, "already flushed %p, skipping\n", capsnap); continue; } spin_lock(&mdsc->cap_dirty_lock); capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; list_add_tail(&capsnap->cap_flush.g_list, &mdsc->cap_flush_list); if (oldest_flush_tid == 0) oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&capsnap->cap_flush.i_list, &ci->i_cap_flush_list); if (first_tid == 1) first_tid = capsnap->cap_flush.tid; last_tid = capsnap->cap_flush.tid; } ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n", inode, ceph_vinop(inode), cap, session->s_mds); break; } ret = -ENOENT; list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { if (iter->tid >= first_tid) { cf = iter; ret = 0; break; } } if (ret < 0) break; first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap, " "ino (%llx.%llx) tid %llu follows %llu\n", ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); spin_lock(&ci->i_ceph_lock); } } void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_mds_session *session = NULL; bool need_put = false; int mds; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (psession) session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { doutc(cl, " no capsnap needs flush, doing nothing\n"); goto out; } if (!ci->i_auth_cap) { doutc(cl, " no auth cap (migrating?), doing nothing\n"); goto out; } mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { doutc(cl, " oops, wrong session %p mutex\n", session); ceph_put_mds_session(session); session = NULL; } if (!session) { spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); goto retry; } // make sure flushsnap messages are sent in proper order. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) *psession = session; else ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); if (!list_empty(&ci->i_snap_flush_item)) need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); if (need_put) iput(inode); } /* * Mark caps dirty. If inode is newly dirty, return the dirty flags. * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. 
*/ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int was = ci->i_dirty_caps; int dirty = 0; lockdep_assert_held(&ci->i_ceph_lock); if (!ci->i_auth_cap) { pr_warn_client(cl, "%p %llx.%llx mask %s, " "but no auth cap (session was closed?)\n", inode, ceph_vinop(inode), ceph_cap_string(mask)); return 0; } doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode, ceph_vinop(inode), ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { struct ceph_mds_session *session = ci->i_auth_cap->session; WARN_ON_ONCE(ci->i_prealloc_cap_flush); swap(ci->i_prealloc_cap_flush, *pcf); if (!ci->i_head_snapc) { WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n", inode, ceph_vinop(inode), ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); dirty |= I_DIRTY_SYNC; } } else { WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; __cap_delay_requeue(mdsc, ci); return dirty; } struct ceph_cap_flush *ceph_alloc_cap_flush(void) { struct ceph_cap_flush *cf; cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); if (!cf) return NULL; cf->is_capsnap = false; return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) { if (cf) kmem_cache_free(ceph_cap_flush_cachep, cf); } static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) { if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); return cf->tid; } return 0; } /* * Remove cap_flush from the mdsc's or inode's flushing cap list. * Return true if caller needs to wake up flush waiters. */ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { prev = list_prev_entry(cf, g_list); prev->wake = true; wake = false; } list_del_init(&cf->g_list); return wake; } static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { prev = list_prev_entry(cf, i_list); prev->wake = true; wake = false; } list_del_init(&cf->i_list); return wake; } /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * * Called under i_ceph_lock. Returns the flush tid. 
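 *
 * Editor's illustration: __detach_cap_flush_from_mdsc()/_from_ci() above do
 * not wake waiters immediately when a flagged entry is removed out of order;
 * they hand the wake flag to the previous (older) entry so the wakeup only
 * happens once everything up to that tid is gone.  A minimal self-contained
 * sketch of that hand-off, using a flat array as a hypothetical stand-in for
 * the flush list:
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct flush_entry {
	unsigned long long tid;
	bool wake;		/* a waiter cares about this tid */
	bool present;
};

/* Detach entry @i; defer its wake flag onto the previous remaining entry. */
static bool detach(struct flush_entry *list, int i)
{
	bool wake = list[i].wake;
	int prev;

	for (prev = i - 1; prev >= 0 && !list[prev].present; prev--)
		;
	if (wake && prev >= 0) {
		list[prev].wake = true;
		wake = false;
	}
	list[i].present = false;
	return wake;
}

int main(void)
{
	struct flush_entry list[] = {
		{ .tid = 1, .wake = false, .present = true },
		{ .tid = 2, .wake = true,  .present = true },
	};
	bool woke;

	woke = detach(list, 1);	/* tid 2 finishes first: wake is deferred  */
	assert(!woke && list[0].wake);
	woke = detach(list, 0);	/* tid 1, now the oldest: wake for real    */
	assert(woke);
	printf("ok\n");
	return 0;
}
#endif
/*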
*/ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap_flush *cf = NULL; int flushing; lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; doutc(cl, "flushing %s, flushing_caps %s -> %s\n", ceph_cap_string(flushing), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps | flushing)); ci->i_flushing_caps |= flushing; ci->i_dirty_caps = 0; doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); cf->tid = ++mdsc->last_cap_flush_tid; list_add_tail(&cf->g_list, &mdsc->cap_flush_list); *oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&cf->i_list, &ci->i_cap_flush_list); return cf->tid; } /* * try to invalidate mapping pages without blocking. */ static int try_nonblocking_invalidate(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ doutc(cl, "%p %llx.%llx success\n", inode, ceph_vinop(inode)); /* save any racing async invalidate some trouble */ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode)); return -1; } bool __ceph_should_report_size(struct ceph_inode_info *ci) { loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; if (size >= ci->i_max_size) return true; /* half of previous max_size increment has been used */ if (ci->i_max_size > ci->i_reported_size && (size << 1) >= ci->i_max_size + ci->i_reported_size) return true; return false; } /* * Swiss army knife function to examine currently used and wanted * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without * further delay. 
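 *
 * Editor's illustration: __ceph_should_report_size() above asks for a new
 * max_size either once the file reaches the current grant or once more than
 * half of the previously granted increment (max_size - reported_size) has
 * been consumed (the early return for Fw already being flushed is omitted
 * here).  A minimal self-contained sketch of that heuristic; should_report()
 * is an editor's stand-in:
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdbool.h>

static bool should_report(long long size, long long max_size, long long reported)
{
	if (size >= max_size)
		return true;
	return max_size > reported && 2 * size >= max_size + reported;
}

int main(void)
{
	/* reported 100, max 200: the midpoint of the increment is 150. */
	assert(!should_report(149, 200, 100));
	assert(should_report(150, 200, 100));
	assert(should_report(200, 200, 100));
	return 0;
}
#endif
/*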
*/ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; } if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); /* Caps which have active references against them */ used = __ceph_caps_used(ci); /* * "issued" represents the current caps that the MDS wants us to have. * "implemented" is the set that we have been granted, and includes the * ones that have not yet been returned to the MDS (the "revoking" set, * usually because they have outstanding references). */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; want = file_wanted; /* The ones we currently want to retain (may be adjusted below) */ retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && __ceph_dir_is_complete(ci)) { /* * If a directory is complete, we want to keep * the exclusive cap. So that MDS does not end up * revoking the shared cap on every create/unlink * operation. */ if (IS_RDONLY(inode)) { want = CEPH_CAP_ANY_SHARED; } else { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; } retain |= want; } else { retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, * because then the mds would revoke it anyway to * journal max_size=0. */ if (ci->i_max_size == 0) retain |= CEPH_CAP_ANY_RD; } } doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " "flushing %s issued %s revoking %s retain %s %s%s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "", (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : ""); /* * If we no longer need to hold onto old our caps, and we may * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && S_ISREG(inode->i_mode) && !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... 
*/ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { doutc(cl, "trying to invalidate on %p %llx.%llx\n", inode, ceph_vinop(inode)); if (try_nonblocking_invalidate(inode) < 0) { doutc(cl, "queuing invalidate\n"); queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; goto retry; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { int mflags = 0; struct cap_msg_args arg; cap = rb_entry(p, struct ceph_cap, ci_node); /* avoid looping forever */ if (mds >= cap->mds || ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue; /* * If we have an auth cap, we don't need to consider any * overlapping caps as used. */ cap_used = used; if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued; revoking = cap->implemented & ~cap->issued; doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap_used), ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); /* completed revocation? going down and there are no caps? */ if (revoking) { if ((revoking & cap_used) == 0) { doutc(cl, "completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; } /* * If the "i_wrbuffer_ref" was increased by mmap or generic * cache write just before the ceph_check_caps() is called, * the Fb capability revoking will fail this time. Then we * must wait for the BDI's delayed work to flush the dirty * pages and to release the "i_wrbuffer_ref", which will cost * at most 5 seconds. That means the MDS needs to wait at * most 5 seconds to finished the Fb capability's revocation. * * Let's queue a writeback for it. */ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && (revoking & CEPH_CAP_FILE_BUFFER)) queue_writeback = true; } if (flags & CHECK_CAPS_FLUSH_FORCE) { doutc(cl, "force to flush caps\n"); goto ack; } if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ if (ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) { doutc(cl, "requesting new max_size\n"); goto ack; } /* approaching file_max? */ if (__ceph_should_report_size(ci)) { doutc(cl, "i_size approaching max_size\n"); goto ack; } } /* flush anything dirty? */ if (cap == ci->i_auth_cap) { if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { doutc(cl, "flushing dirty caps\n"); goto ack; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { doutc(cl, "flushing snap caps\n"); goto ack; } } /* want more caps from mds? 
*/ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) goto ack; if (!__cap_is_valid(cap)) goto ack; } /* things we might delay */ if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ ack: ceph_put_mds_session(session); session = ceph_get_mds_session(cap->session); /* kick flushing and flush snaps before sending normal * cap message */ if (cap == ci->i_auth_cap && (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry; } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, false, &oldest_flush_tid); if (flags & CHECK_CAPS_FLUSH && list_empty(&session->s_cap_dirty)) mflags |= CEPH_CLIENT_CAPS_SYNC; } else { flushing = 0; flush_tid = 0; spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); } mds = cap->mds; /* remember mds, so we don't repeat */ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); spin_lock(&ci->i_ceph_lock); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* periodically re-calculate caps wanted by open files */ if (__ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list) && (file_wanted & ~CEPH_CAP_PIN) && !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { __cap_delay_requeue(mdsc, ci); } spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); if (queue_writeback) ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } /* * Try to flush dirty caps back to the auth mds. */ static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; spin_lock(&ci->i_ceph_lock); retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; struct cap_msg_args arg; struct ceph_mds_session *session = cap->session; if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); goto out; } if (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry_locked; } flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, true, &oldest_flush_tid); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = list_last_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); cf->wake = true; flush_tid = cf->tid; } flushing = ci->i_flushing_caps; spin_unlock(&ci->i_ceph_lock); } out: *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. 
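 *
 * Editor's illustration: flush tids are handed out in increasing order and
 * the per-inode flush list is kept oldest-first, so "flushed through tid T"
 * reduces to "the list is empty or its first entry is newer than T", which
 * is what the function below checks.  A minimal self-contained sketch of
 * that test (flushed_through() is an editor's stand-in):
 */
#if 0	/* illustrative sketch only */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

static bool flushed_through(const unsigned long long *pending, size_t n,
			    unsigned long long flush_tid)
{
	return n == 0 || pending[0] > flush_tid;
}

int main(void)
{
	unsigned long long pending[] = { 7, 9 };	/* oldest first */

	assert(!flushed_through(pending, 2, 8));	/* tid 7 still pending */
	assert(flushed_through(pending + 1, 1, 8));	/* only tid 9 remains  */
	assert(flushed_through(pending, 0, 8));		/* nothing pending     */
	return 0;
}
#endif
/*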
*/ static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); int ret = 1; spin_lock(&ci->i_ceph_lock); if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush * cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); if (cf->tid <= flush_tid) ret = 0; } spin_unlock(&ci->i_ceph_lock); return ret; } /* * flush the mdlog and wait for any unsafe requests to complete. */ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { req1 = list_last_entry(&ci->i_unsafe_dirops, struct ceph_mds_request, r_unsafe_dir_item); ceph_mdsc_get_request(req1); } if (!list_empty(&ci->i_unsafe_iops)) { req2 = list_last_entry(&ci->i_unsafe_iops, struct ceph_mds_request, r_unsafe_target_item); ceph_mdsc_get_request(req2); } spin_unlock(&ci->i_unsafe_lock); /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ if (req1 || req2) { struct ceph_mds_request *req; struct ceph_mds_session **sessions; struct ceph_mds_session *s; unsigned int max_sessions; int i; mutex_lock(&mdsc->mutex); max_sessions = mdsc->max_sessions; sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { mutex_unlock(&mdsc->mutex); err = -ENOMEM; goto out; } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } if (req2) { list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } spin_unlock(&ci->i_unsafe_lock); /* the auth MDS */ spin_lock(&ci->i_ceph_lock); if (ci->i_auth_cap) { s = ci->i_auth_cap->session; if (!sessions[s->s_mds]) sessions[s->s_mds] = ceph_get_mds_session(s); } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDSes */ for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); ceph_put_mds_session(s); } } kfree(sessions); } doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode, ceph_vinop(inode), req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; } out: if (req1) ceph_mdsc_put_request(req1); if (req2) ceph_mdsc_put_request(req2); return err; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int ret, err; int dirty; doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode), datasync ? 
" datasync" : ""); ret = file_write_and_wait_range(file, start, end); if (datasync) goto out; ret = ceph_wait_on_async_create(inode); if (ret) goto out; dirty = try_flush_caps(inode, &flush_tid); doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty)); err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } if (err < 0) ret = err; err = file_check_and_advance_wb_err(file); if (err < 0) ret = err; out: doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode), datasync ? " datasync" : "", ret); return ret; } /* * Flush any dirty caps back to the mds. If we aren't asked to wait, * queue inode for flush but don't do so immediately, because we can * get by with fewer MDS messages if we wait for data writeback to * complete first. */ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int err = 0; int dirty; int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { err = ceph_wait_on_async_create(inode); if (err) return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } return err; } static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; u64 first_tid = 0; u64 last_snap_flush = 0; /* Don't do anything until create reply comes in */ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) return; ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } } list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { if (cf->tid < first_tid) continue; cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); break; } first_tid = cf->tid + 1; if (!cf->is_capsnap) { struct cap_msg_args arg; doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n", inode, ceph_vinop(inode), cap, cf->tid, ceph_cap_string(cf->caps)); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? 
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap," " %p %llx.%llx tid %llu follows %llu\n", inode, ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); } spin_lock(&ci->i_ceph_lock); } } void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } /* * if flushing caps were revoked, we re-send the cap flush * in client reconnect stage. This guarantees MDS * processes * the cap flush message before issuing the flushing caps to * other client. */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { /* encode_caps_cb() also will reset these sequence * numbers. 
make sure sequence numbers in cap flush * message match later reconnect message */ cap->seq = 0; cap->issue_seq = 0; cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; lockdep_assert_held(&session->s_mutex); doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap *cap = ci->i_auth_cap; struct inode *inode = &ci->netfs.inode; lockdep_assert_held(&ci->i_ceph_lock); doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n", inode, ceph_vinop(inode), ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { u64 oldest_flush_tid; spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } } /* * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. */ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); lockdep_assert_held(&ci->i_ceph_lock); if (got & CEPH_CAP_PIN) ci->i_pin_ref++; if (got & CEPH_CAP_FILE_RD) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; if (got & CEPH_CAP_FILE_EXCL) ci->i_fx_ref++; if (got & CEPH_CAP_FILE_WR) { if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { BUG_ON(!snap_rwsem_locked); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } ci->i_wr_ref++; } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(inode); ci->i_wb_ref++; doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref); } } /* * Try to grab cap references. Specify those refs we @want, and the * minimal set we @need. Also include the larger offset we are writing * to (when applicable), and check against max_size here as well. * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, * or a negative error code. There are 3 special error codes: * -EAGAIN: need to sleep but non-blocking is specified * -EFBIG: ask caller to call check_max_size() and try again. * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. 
*/ enum { /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ NON_BLOCKING = (1 << 8), CHECK_FILELOCK = (1 << 9), }; static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; int have, implemented; bool snap_rwsem_locked = false; doutc(cl, "%p %llx.%llx need %s want %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(want)); again: spin_lock(&ci->i_ceph_lock); if ((flags & CHECK_FILELOCK) && (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { doutc(cl, "%p %llx.%llx error filelock\n", inode, ceph_vinop(inode)); ret = -EIO; goto out_unlock; } /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) { up_read(&mdsc->snap_rwsem); snap_rwsem_locked = false; } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } have = __ceph_caps_issued(ci, &implemented); if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n", inode, ceph_vinop(inode), endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we * can get a final snapshot value for size+mtime. */ if (__ceph_have_pending_cap_snap(ci)) { doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode, ceph_vinop(inode)); goto out_unlock; } } if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. * * For RDCACHE|RD -> RD, there is not need to wait and we can * just exclude the revoking caps and force to sync read. 
*/ int not = want & ~(have & need); int revoking = implemented & ~have; int exclude = revoking & not; doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { if (!down_read_trylock(&mdsc->snap_rwsem)) { /* * we can not call down_read() when * task isn't in TASK_RUNNING state */ if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); snap_rwsem_locked = true; goto again; } snap_rwsem_locked = true; } if ((have & want) == want) *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { int session_readonly = false; int mds_wanted; if (ci->i_auth_cap && (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { struct ceph_mds_session *s = ci->i_auth_cap->session; spin_lock(&s->s_cap_lock); session_readonly = s->s_readonly; spin_unlock(&s->s_cap_lock); } if (session_readonly) { doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n", inode, ceph_vinop(inode), ceph_cap_string(need), ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } if (ceph_inode_is_shutdown(inode)) { doutc(cl, "%p %llx.%llx inode is shutdown\n", inode, ceph_vinop(inode)); ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~mds_wanted) { doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(mds_wanted)); ret = -EUCLEAN; goto out_unlock; } doutc(cl, "%p %llx.%llx have %s need %s\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: __ceph_touch_fmode(ci, mdsc, flags); spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); if (!ret) ceph_update_cap_mis(&mdsc->metric); else if (ret == 1) ceph_update_cap_hit(&mdsc->metric); doutc(cl, "%p %llx.%llx ret %d got %s\n", inode, ceph_vinop(inode), ret, ceph_cap_string(*got)); return ret; } /* * Check the offset we are writing up to against our current * max_size. If necessary, tell the MDS we want to write to * a larger offset. */ static void check_max_size(struct inode *inode, loff_t endoff) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int check = 0; /* do we need to explicitly request a larger max_size? 
*/ spin_lock(&ci->i_ceph_lock); if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n", inode, ceph_vinop(inode), endoff); ci->i_wanted_max_size = endoff; } /* duplicate ceph_check_caps()'s logic */ if (ci->i_auth_cap && (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) { int fmode = 0; if (caps & CEPH_CAP_FILE_RD) fmode |= CEPH_FILE_MODE_RD; if (caps & CEPH_CAP_FILE_WR) fmode |= CEPH_FILE_MODE_WR; return fmode; } int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret, flags; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_ANY_DIR_OPS)); if (need) { ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; } flags = get_used_fmode(need | want); if (nonblock) flags |= NON_BLOCKING; ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) ret = 0; return ret; } /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; flags = get_used_fmode(need | want); while (true) { flags &= CEPH_FILE_MODE_MASK; if (vfs_inode_has_locks(inode)) flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { #ifdef CONFIG_DEBUG_FS struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; #endif DEFINE_WAIT_FUNC(wait, woken_wake_function); #ifdef CONFIG_DEBUG_FS cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; cw.want = want; spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); #endif /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; while (!(ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); #ifdef CONFIG_DEBUG_FS spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); #endif if (ret == -EAGAIN) continue; } if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); return -EBADF; } if (ret < 0) { if (ret == -EFBIG || ret == -EUCLEAN) { int ret2 = ceph_wait_on_async_create(inode); if (ret2 < 0) return ret2; } if (ret == -EFBIG) { check_max_size(inode, endoff); continue; } if (ret == -EUCLEAN) { /* session was killed, try renew caps */ ret = 
ceph_renew_caps(inode, flags); if (ret == 0) continue; } return ret; } if (S_ISREG(ci->netfs.inode.i_mode) && ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = find_get_page(inode->i_mapping, 0); if (page) { bool uptodate = PageUptodate(page); put_page(page); if (uptodate) break; } /* * drop cap refs first because getattr while * holding * caps refs can cause deadlock. */ ceph_put_cap_refs(ci, _got); _got = 0; /* * getattr request will bring inline data into * page cache */ ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) return ret; continue; } break; } *got = _got; return 0; } int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); return __ceph_get_caps(inode, fi, need, want, endoff, got); } /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); ceph_take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } /* * drop cap_snap that is not associated with any snapshot. * we don't need to send FLUSHSNAP message for it. */ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows); BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; list_del(&capsnap->ci_item); ceph_put_cap_snap(capsnap); return 1; } return 0; } enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, PUT_CAP_REFS_ASYNC, }; /* * Release cap refs. * * If we released the last ref on any given cap, call ceph_check_caps * to release (or schedule a release). * * If we are releasing a WR cap (from a sync write), finalize any affected * cap_snap, and wake up any waiters. */ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) if (--ci->i_rd_ref == 0) last++; if (had & CEPH_CAP_FILE_CACHE) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_EXCL) if (--ci->i_fx_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; /* put the ref held by ceph_take_cap_refs() */ put++; check_flushsnaps = true; } doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { /* * The Fb caps will always be took and released * together with the Fw caps. 
*/ WARN_ON_ONCE(ci->i_wb_ref); last++; check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) ceph_change_snap_realm(inode, NULL); } } if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); capsnap->writing = 0; if (ceph_try_drop_cap_snap(ci, capsnap)) /* put the ref held by ceph_queue_cap_snap() */ put++; else if (__ceph_finish_cap_snap(ci, capsnap)) flushsnaps = 1; wake = 1; } spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); switch (mode) { case PUT_CAP_REFS_SYNC: if (last) ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; case PUT_CAP_REFS_ASYNC: if (last) ceph_queue_check_caps(inode); else if (flushsnaps) ceph_queue_flush_snaps(inode); break; default: break; } if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) iput(inode); } void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); } void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. * Once all dirty data for a cap_snap is flushed, flush snapped file * metadata back to the MDS. If we dropped the last ref, call * ceph_check_caps. */ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; bool flush_snaps = false; bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; if (ci->i_wrbuffer_ref == 0) { last = true; put++; } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n", inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->context == snapc) { capsnap = iter; break; } } if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. */ WARN_ON_ONCE(ci->i_auth_cap); goto unlock; } capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = true; if (!capsnap->writing) { if (ceph_try_drop_cap_snap(ci, capsnap)) { put++; } else { ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; flush_snaps = true; } } } doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n", inode, ceph_vinop(inode), capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", complete_capsnap ? 
" (complete capsnap)" : ""); } unlock: spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } if (complete_capsnap) wake_up_all(&ci->i_cap_wq); while (put-- > 0) { iput(inode); } } /* * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. */ static void invalidate_aliases(struct inode *inode) { struct ceph_client *cl = ceph_inode_to_client(inode); struct dentry *dn, *prev = NULL; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns * hashed dentry. After calling d_invalidate(), the * dentry becomes unhashed. * * For directory inode, d_find_alias() can return * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { if (dn == prev) { dput(dn); break; } d_invalidate(dn); if (prev) dput(prev); prev = dn; } if (prev) dput(prev); } struct cap_extra_info { struct ceph_string *pool_ns; /* inline data */ u64 inline_version; void *inline_data; u32 inline_len; /* dirstat */ bool dirstat_valid; u64 nfiles; u64 nsubdirs; u64 change_attr; /* currently issued */ int issued; struct timespec64 btime; u8 *fscrypt_auth; u32 fscrypt_auth_len; u64 fscrypt_file_size; }; /* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. */ static void handle_cap_grant(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_mds_caps *grant, struct ceph_buffer *xattr_buf, struct cap_extra_info *extra_info) __releases(ci->i_ceph_lock) __releases(session->s_mdsc->snap_rwsem) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); unsigned char check_caps = 0; bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); bool wake = false; bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; bool deleted_inode = false; bool fill_inline = false; bool revoke_wait = false; int flags = 0; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode, ceph_vinop(inode), cap, session->s_mds, seq, ceph_cap_string(newcaps)); doutc(cl, " size %llu max_size %llu, i_size %llu\n", size, max_size, i_size_read(inode)); /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } } } if (was_stale) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. 
we received the cap export message, * but still haven't received the cap import message. handle_cap_export * updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message * that was sent before the cap import message. So don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); seq = cap->seq; newcaps |= cap->issued; } /* side effects now are allowed */ cap->cap_gen = atomic_read(&session->s_cap_gen); cap->seq = seq; __check_cap_issue(ci, cap, newcaps); inode_set_max_iversion_raw(inode, extra_info->change_attr); if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { umode_t mode = le32_to_cpu(grant->mode); if (inode_wrong_type(inode, mode)) pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", ceph_vinop(inode), inode->i_mode, mode); else inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, ceph_vinop(inode), inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, ci->fscrypt_auth_len)) pr_warn_ratelimited_client(cl, "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", ci->fscrypt_auth_len, extra_info->fscrypt_auth_len); #endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0) deleted_inode = true; } if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); u64 version = le64_to_cpu(grant->xattr_version); if (version > ci->i_xattrs.version) { doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n", version, inode, ceph_vinop(inode), len); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; ceph_forget_all_cached_acls(inode); ceph_security_invalidate_secctx(inode); } } if (newcaps & CEPH_CAP_ANY_RD) { struct timespec64 mtime, atime, ctime; /* ctime/mtime/atime? */ ceph_decode_timespec64(&mtime, &grant->mtime); ceph_decode_timespec64(&atime, &grant->atime); ceph_decode_timespec64(&ctime, &grant->ctime); ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); } if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { ci->i_files = extra_info->nfiles; ci->i_subdirs = extra_info->nsubdirs; } if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, lockdep_is_held(&ci->i_ceph_lock)); rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); if (ci->i_layout.pool_id != old_pool || extra_info->pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; extra_info->pool_ns = old_ns; /* size/truncate_seq? 
*/ queue_trunc = ceph_fill_file_size(inode, extra_info->issued, le32_to_cpu(grant->truncate_seq), le64_to_cpu(grant->truncate_size), size); } if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { if (max_size != ci->i_max_size) { doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; } wake = true; } } /* check cap bits */ wanted = __ceph_caps_wanted(ci); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, " my wanted = %s, used = %s, dirty %s\n", ceph_cap_string(wanted), ceph_cap_string(used), ceph_cap_string(dirty)); if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && (wanted & ~(cap->mds_wanted | newcaps))) { /* * If mds is importing cap, prior cap messages that update * 'wanted' may get dropped by mds (migrate seq mismatch). * * We don't send cap message to update 'wanted' if what we * want are already issued. If mds revokes caps, cap message * that releases caps also tells mds what we want. But if * caps got revoked by mds forcedly (session stale). We may * haven't told mds what we want. */ check_caps = 1; } /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; doutc(cl, "revocation: %s -> %s (revoking %s)\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && (revoking & used & CEPH_CAP_FILE_BUFFER)) { writeback = true; /* initiate writeback; will delay ack */ revoke_wait = true; } else if (queue_invalidate && revoking == CEPH_CAP_FILE_CACHE && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) { revoke_wait = true; /* do nothing yet, invalidation will be queued */ } else if (cap == ci->i_auth_cap) { check_caps = 1; /* check auth cap only */ } else { check_caps = 2; /* check all caps */ } /* If there is new caps, try to wake up the waiters */ if (~cap->issued & newcaps) wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { doutc(cl, "caps unchanged: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); } else { doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); /* non-auth MDS is revoking the newly grant caps ? 
*/ if (cap == ci->i_auth_cap && __ceph_caps_revoking_other(ci, cap, newcaps)) check_caps = 2; cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a * pending revocation */ wake = true; } BUG_ON(cap->issued & ~cap->implemented); /* don't let check_caps skip sending a response to MDS for revoke msgs */ if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { cap->mds_wanted = 0; flags |= CHECK_CAPS_FLUSH_FORCE; if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ } if (extra_info->inline_version > 0 && extra_info->inline_version >= ci->i_inline_version) { ci->i_inline_version = extra_info->inline_version; if (ci->i_inline_version != CEPH_INLINE_NONE && (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) fill_inline = true; } if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (ci->i_auth_cap == cap) { if (newcaps & ~extra_info->issued) wake = true; if (ci->i_requested_max_size > max_size || !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { /* re-request max_size if necessary */ ci->i_requested_max_size = 0; wake = true; } ceph_kick_flushing_inode_caps(session, ci); } up_read(&session->s_mdsc->snap_rwsem); } spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, extra_info->inline_len); if (queue_trunc) ceph_queue_vmtruncate(inode); if (writeback) /* * queue inode for writeback: we can't actually call * filemap_write_and_wait, etc. from message handler * context. */ ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); if (deleted_inode) invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); mutex_unlock(&session->s_mutex); if (check_caps == 1) ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL); } /* * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the * MDS has been safely committed. */ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; bool wake_ci = false; bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? */ if (cf->tid == flush_tid) cleaned = cf->caps; /* Is this a capsnap? */ if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { /* * An earlier or current tid. The FLUSH_ACK should * represent a superset of this flush's caps. */ wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { /* * This is a later one. Any caps in it are still dirty * so don't count them as cleaned. 
*/ cleaned &= ~cf->caps; if (!cleaned) break; } } doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n", inode, ceph_vinop(inode), session->s_mds, seq, ceph_cap_string(dirty), ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); if (ci->i_flushing_caps == 0) { if (list_empty(&ci->i_cap_flush_list)) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) { struct inode *inode = &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, i_flushing_item)->netfs.inode; doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n", session->s_mds, inode, ceph_vinop(inode)); } } mdsc->num_cap_flushing--; doutc(cl, " %p %llx.%llx now !flushing\n", inode, ceph_vinop(inode)); if (ci->i_dirty_caps == 0) { doutc(cl, " %p %llx.%llx now clean\n", inode, ceph_vinop(inode)); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = true; if (ci->i_wr_ref == 0 && ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } } spin_unlock(&mdsc->cap_dirty_lock); out: spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; bool ret; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap, inode, ceph_vinop(inode), ci); list_del_init(&capsnap->ci_item); ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); if (wake_ci) *wake_ci = ret; spin_lock(&mdsc->cap_dirty_lock); if (list_empty(&ci->i_cap_flush_list)) list_del_init(&ci->i_flushing_item); ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); if (wake_mdsc) *wake_mdsc = ret; spin_unlock(&mdsc->cap_dirty_lock); } void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); lockdep_assert_held(&ci->i_ceph_lock); WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); } /* * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. * * Caller hold s_mutex. 
*/ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode, ceph_vinop(inode), ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->follows == follows) { if (iter->cap_flush.tid != flush_tid) { doutc(cl, " cap_snap %p follows %lld " "tid %lld != %lld\n", iter, follows, flush_tid, iter->cap_flush.tid); break; } capsnap = iter; break; } else { doutc(cl, " skipping cap_snap %p follows %lld\n", iter, iter->follows); } } if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); iput(inode); } } /* * Handle TRUNC from MDS, indicating file truncation. * * caller hold s_mutex. */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session, struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int mds = session->s_mds; int seq = le32_to_cpu(trunc->seq); u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); u64 truncate_size = le64_to_cpu(trunc->truncate_size); u64 size = le64_to_cpu(trunc->size); int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); bool queue_trunc = false; lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n", inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; } /* * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a * different one. If we are the most recent migration we've seen (as * indicated by mseq), make note of the migrating cap bits for the * duration (until we see the corresponding IMPORT). 
* * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *tsession = NULL; struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; u32 t_issue_seq, t_mseq; int target, issued; int mds = session->s_mds; if (ph) { t_cap_id = le64_to_cpu(ph->cap_id); t_issue_seq = le32_to_cpu(ph->issue_seq); t_mseq = le32_to_cpu(ph->mseq); target = le32_to_cpu(ph->mds); } else { t_cap_id = t_issue_seq = t_mseq = 0; target = -1; } doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n", ceph_vinop(inode), target, t_issue_seq, t_mseq); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) goto out_unlock; if (target < 0) { ceph_remove_cap(mdsc, cap, false); goto out_unlock; } /* * now we know we haven't received the cap import message yet * because the exported cap still exist. */ issued = cap->issued; if (issued != cap->implemented) pr_err_ratelimited_client(cl, "issued != implemented: " "%p %llx.%llx mds%d seq %d mseq %d" " issued %s implemented %s\n", inode, ceph_vinop(inode), mds, cap->seq, cap->mseq, ceph_cap_string(issued), ceph_cap_string(cap->implemented)); tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) { doutc(cl, " updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; tcap->seq = t_issue_seq - 1; tcap->issue_seq = t_issue_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) { ci->i_auth_cap = tcap; change_auth_cap_ses(ci, tcap->session); } } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export target */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &tcap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ tsession = ceph_mdsc_open_export_target_session(mdsc, target); if (!IS_ERR(tsession)) { if (mds > target) { mutex_lock(&session->s_mutex); mutex_lock_nested(&tsession->s_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&tsession->s_mutex); mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); tsession = NULL; target = -1; mutex_lock(&session->s_mutex); } goto retry; out_unlock: spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); ceph_put_mds_session(tsession); } if (new_cap) ceph_put_cap(mdsc, new_cap); } /* * Handle cap IMPORT. * * caller holds s_mutex. 
acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, struct ceph_cap **target_cap, int *old_issued) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; int issued; unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); u64 p_cap_id; u32 piseq = 0; u32 pmseq = 0; int peer; if (ph) { p_cap_id = le64_to_cpu(ph->cap_id); peer = le32_to_cpu(ph->mds); piseq = le32_to_cpu(ph->issue_seq); pmseq = le32_to_cpu(ph->mseq); } else { p_cap_id = 0; peer = -1; } doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n", ceph_vinop(inode), peer, piseq, pmseq); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { if (!new_cap) { spin_unlock(&ci->i_ceph_lock); new_cap = ceph_get_cap(mdsc, NULL); spin_lock(&ci->i_ceph_lock); goto retry; } cap = new_cap; } else { if (new_cap) { ceph_put_cap(mdsc, new_cap); new_cap = NULL; } } __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, &new_cap); ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; if (ocap && ocap->cap_id == p_cap_id) { doutc(cl, " remove export cap %p mds%d flags %d\n", ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && (ocap->seq != piseq || ocap->mseq != pmseq)) { pr_err_ratelimited_client(cl, "mismatched seq/mseq: " "%p %llx.%llx mds%d seq %d mseq %d" " importer mds%d has peer seq %d mseq %d\n", inode, ceph_vinop(inode), peer, ocap->seq, ocap->mseq, mds, piseq, pmseq); } ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; *target_cap = cap; } #ifdef CONFIG_FS_ENCRYPTION static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); if (extra->fscrypt_auth_len) { ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, GFP_KERNEL); if (!extra->fscrypt_auth) return -ENOMEM; ceph_decode_copy_safe(p, end, extra->fscrypt_auth, extra->fscrypt_auth_len, bad); } ceph_decode_32_safe(p, end, len, bad); if (len >= sizeof(u64)) { ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); len -= sizeof(u64); } ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #else static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; /* Don't care about these fields unless we're encryption-capable */ ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #endif /* * Handle a caps message from the MDS. * * Identify the appropriate session, inode, and call the right handler * based on the cap op. 
*/ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; struct ceph_snap_realm *realm = NULL; int op; int msg_version = le16_to_cpu(msg->hdr.version); u32 seq, mseq, issue_seq; struct ceph_vino vino; void *snaptrace; size_t snaptrace_len; void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; bool close_sessions = false; bool do_cap_release = false; if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); issue_seq = le32_to_cpu(h->issue_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); p = snaptrace + snaptrace_len; if (msg_version >= 2) { u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; p += flock_len; } if (msg_version >= 3) { if (op == CEPH_CAP_OP_IMPORT) { if (p + sizeof(*peer) > end) goto bad; peer = p; p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } if (msg_version >= 4) { ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); if (p + extra_info.inline_len > end) goto bad; extra_info.inline_data = p; p += extra_info.inline_len; } if (msg_version >= 5) { struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; u32 epoch_barrier; ceph_decode_32_safe(&p, end, epoch_barrier, bad); ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); } if (msg_version >= 8) { u32 pool_ns_len; /* version >= 6 */ ceph_decode_skip_64(&p, end, bad); // flush_tid /* version >= 7 */ ceph_decode_skip_32(&p, end, bad); // caller_uid ceph_decode_skip_32(&p, end, bad); // caller_gid /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { ceph_decode_need(&p, end, pool_ns_len, bad); extra_info.pool_ns = ceph_find_or_create_string(p, pool_ns_len); p += pool_ns_len; } } if (msg_version >= 9) { struct ceph_timespec *btime; if (p + sizeof(*btime) > end) goto bad; btime = p; ceph_decode_timespec64(&extra_info.btime, btime); p += sizeof(*btime); ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); } if (msg_version >= 11) { /* version >= 10 */ ceph_decode_skip_32(&p, end, bad); // flags /* version >= 11 */ extra_info.dirstat_valid = true; ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } if (msg_version >= 12) { if (parse_fscrypt_fields(&p, end, &extra_info)) goto bad; } /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, seq, issue_seq, mseq); mutex_lock(&session->s_mutex); if (!inode) { doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { case CEPH_CAP_OP_FLUSHSNAP_ACK: 
handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), h, session); goto done; case CEPH_CAP_OP_EXPORT: handle_cap_export(inode, h, peer, session); goto done_unlocked; case CEPH_CAP_OP_IMPORT: realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); if (ceph_update_snap_trace(mdsc, snaptrace, snaptrace + snaptrace_len, false, &realm)) { up_write(&mdsc->snap_rwsem); close_sessions = true; goto done; } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); } spin_lock(&ci->i_ceph_lock); handle_cap_import(mdsc, inode, h, peer, session, &cap, &extra_info.issued); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } /* the rest require a cap */ spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &extra_info.issued); extra_info.issued |= __ceph_caps_dirty(ci); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), h, session, cap); break; case CEPH_CAP_OP_TRUNC: queue_trunc = handle_cap_trunc(inode, h, session, &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); break; default: spin_unlock(&ci->i_ceph_lock); pr_err_client(cl, "unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } done: mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); out: ceph_dec_mds_stopping_blocker(mdsc); ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); kfree(extra_info.fscrypt_auth); return; flush_cap_releases: /* * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ if (do_cap_release) { cap = ceph_get_cap(mdsc, NULL); cap->cap_ino = vino.ino; cap->queue_release = 1; cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } ceph_flush_session_cap_releases(mdsc, session); goto done; bad: pr_err_client(cl, "corrupt message\n"); ceph_msg_dump(msg); goto out; } /* * Delayed work handler to process end of delayed cap release LRU list. * * If new caps are added to the list while processing it, these won't get * processed in this run. In this case, the ci->i_hold_caps_max will be * returned so that the work can be scheduled accordingly. 
*/ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_mount_options *opt = mdsc->fsc->mount_options; unsigned long delay_max = opt->caps_wanted_delay_max * HZ; unsigned long loop_start = jiffies; unsigned long delay = 0; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { doutc(cl, "caps added recently. Exiting loop"); delay = ci->i_hold_caps_max; break; } if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && time_before(jiffies, ci->i_hold_caps_max)) break; list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } /* * Make sure too many dirty caps or general * slowness doesn't block mdsc delayed work, * preventing send_renew_caps() from running. */ if (time_after_eq(jiffies, loop_start + 5 * HZ)) break; } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); return delay; } /* * Flush all dirty caps to the mds */ static void flush_dirty_session_caps(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_dirty_lock); while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->netfs.inode; ihold(inode); doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); doutc(cl, "done\n"); } void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } /* * Flush all cap releases to the mds */ static void flush_cap_releases(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); spin_lock(&s->s_cap_lock); if (s->s_num_cap_releases) ceph_flush_session_cap_releases(mdsc, s); spin_unlock(&s->s_cap_lock); doutc(cl, "done\n"); } void ceph_flush_cap_releases(struct ceph_mds_client *mdsc) { ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true); } void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { unsigned long now = jiffies; if (fmode & CEPH_FILE_MODE_RD) ci->i_last_rd = now; if (fmode & CEPH_FILE_MODE_WR) ci->i_last_wr = now; /* queue periodic check */ if (fmode && __ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list)) __cap_delay_requeue(mdsc, ci); } void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; if (count == 1) atomic64_inc(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { /* * If any of the mode ref is larger than 0, * that means it has been already opened by * others. Just skip checking the PIN ref. 
*/ if (i && ci->i_nr_by_mode[i]) already_opened = true; if (bits & (1 << i)) ci->i_nr_by_mode[i] += count; } if (!already_opened) percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule * their delayed release). */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; if (count == 1) atomic64_dec(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { BUG_ON(ci->i_nr_by_mode[i] < count); ci->i_nr_by_mode[i] -= count; } /* * If any of the mode ref is not 0 after * decreased, that means it is still opened * by others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) is_closed = false; } if (is_closed) percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * For a soon-to-be unlinked file, drop the LINK caps. If it * looks like the link count will hit 0, drop any other caps (other * than PIN) we don't specifically want (due to the file still being * open). */ int ceph_drop_caps_for_unlink(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_unlink_delay_list); spin_unlock(&mdsc->cap_delay_lock); /* * Fire the work immediately, because the MDS maybe * waiting for caps release. */ ceph_queue_cap_unlink_work(mdsc); } } spin_unlock(&ci->i_ceph_lock); return drop; } /* * Helpers for embedding cap and dentry lease releases into mds * requests. * * @force is used by dentry_release (below) to force inclusion of a * record for the directory inode, even when there aren't any caps to * drop. 
*/ int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int used, dirty; int ret = 0; spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n", inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); /* only drop unused, clean caps */ drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { unless &= cap->issued; if (unless) { if (unless & CEPH_CAP_AUTH_EXCL) drop &= ~CEPH_CAP_AUTH_SHARED; if (unless & CEPH_CAP_LINK_EXCL) drop &= ~CEPH_CAP_LINK_SHARED; if (unless & CEPH_CAP_XATTR_EXCL) drop &= ~CEPH_CAP_XATTR_SHARED; if (unless & CEPH_CAP_FILE_EXCL) drop &= ~CEPH_CAP_FILE_SHARED; } if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); doutc(cl, "%p %llx.%llx cap %p %s -> %s, " "wanted %s -> %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(cap->issued & ~drop), ceph_cap_string(cap->mds_wanted), ceph_cap_string(wanted)); cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; if (cap == ci->i_auth_cap && !(wanted & CEPH_CAP_ANY_FILE_WR)) ci->i_requested_max_size = 0; } else { doutc(cl, "%p %llx.%llx cap %p %s (force)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; *p += sizeof(*rel); ret = 1; } else { doutc(cl, "%p %llx.%llx cap %p %s (noop)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } } spin_unlock(&ci->i_ceph_lock); return ret; } /** * ceph_encode_dentry_release - encode a dentry release into an outgoing request * @p: outgoing request buffer * @dentry: dentry to release * @dir: dir to release it from * @mds: mds that we're speaking to * @drop: caps being dropped * @unless: unless we have these caps * * Encode a dentry release into an outgoing request buffer. Returns 1 if the * thing was released, or a negative error code otherwise. */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; /* This shouldn't happen */ BUG_ON(!dir); /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. 
*/ spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { int len = dentry->d_name.len; doutc(cl, "%p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); memcpy(*p, dentry->d_name.name, len); spin_unlock(&dentry->d_lock); if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { len = ceph_encode_encrypted_dname(dir, *p, len); if (len < 0) return len; } rel->dname_len = cpu_to_le32(len); *p += len; } else { spin_unlock(&dentry->d_lock); } return ret; } static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; int capsnap_release = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n", ci, inode, ceph_vinop(inode)); while (!list_empty(&ci->i_cap_snaps)) { capsnap = list_first_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); __ceph_remove_capsnap(inode, capsnap, NULL, NULL); ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); capsnap_release++; } wake_up_all(&ci->i_cap_wq); wake_up_all(&mdsc->cap_flushing_wq); return capsnap_release; } int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); bool is_auth; bool dirty_dropped = false; int iputs = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n", cap, ci, inode, ceph_vinop(inode)); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); if (is_auth) { struct ceph_cap_flush *cf; if (ceph_inode_is_shutdown(inode)) { if (inode->i_data.nrpages > 0) *invalidate = true; if (ci->i_wrbuffer_ref > 0) mapping_set_error(&inode->i_data, -EIO); } spin_lock(&mdsc->cap_dirty_lock); /* trash all of the cap flushes for this inode */ while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); list_del_init(&cf->g_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited_client(cl, " dropping dirty %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_vinop(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited_client(cl, " dropping dirty+flushing %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_vinop(inode)); ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; 
                        pr_warn_ratelimited_client(cl,
                                " dropping file locks for %p %llx.%llx\n",
                                inode, ceph_vinop(inode));
                }

                if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
                        cf = ci->i_prealloc_cap_flush;
                        ci->i_prealloc_cap_flush = NULL;
                        if (!cf->is_capsnap)
                                ceph_free_cap_flush(cf);
                }

                if (!list_empty(&ci->i_cap_snaps))
                        iputs = remove_capsnaps(mdsc, inode);
        }
        if (dirty_dropped)
                ++iputs;
        return iputs;
}
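/*
 * A minimal, standalone sketch (not kernel code) of the cap-mask arithmetic
 * ceph_encode_inode_release() performs above: caps that are in use or dirty
 * are never dropped, and each "unless" exclusive bit we hold protects the
 * matching shared bit from being dropped. The FAKE_CAP_* constants and the
 * releasable_caps() helper below are illustrative placeholders, not the real
 * CEPH_CAP_* encoding or a real ceph API.
 */
#include <stdio.h>

#define FAKE_CAP_AUTH_SHARED  0x01
#define FAKE_CAP_AUTH_EXCL    0x02
#define FAKE_CAP_FILE_SHARED  0x04
#define FAKE_CAP_FILE_EXCL    0x08

static int releasable_caps(int issued, int used, int dirty, int drop, int unless)
{
        /* only drop unused, clean caps */
        drop &= ~(used | dirty);

        /* only "unless" bits we actually hold matter */
        unless &= issued;

        /* an exclusive cap we hold shields the corresponding shared cap */
        if (unless & FAKE_CAP_AUTH_EXCL)
                drop &= ~FAKE_CAP_AUTH_SHARED;
        if (unless & FAKE_CAP_FILE_EXCL)
                drop &= ~FAKE_CAP_FILE_SHARED;

        /* only bits that are actually issued can be released */
        return issued & drop;
}

int main(void)
{
        int issued = FAKE_CAP_AUTH_SHARED | FAKE_CAP_FILE_SHARED | FAKE_CAP_FILE_EXCL;
        int used   = 0;
        int dirty  = 0;
        int drop   = FAKE_CAP_AUTH_SHARED | FAKE_CAP_FILE_SHARED;
        int unless = FAKE_CAP_FILE_EXCL;

        /* FILE_SHARED is protected by the held FILE_EXCL, so only AUTH_SHARED drops */
        printf("released mask: 0x%x\n",
               releasable_caps(issued, used, dirty, drop, unless));
        return 0;
}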
/* zutil.h -- internal interface and configuration of the compression library
 * Copyright (C) 1995-1998 Jean-loup Gailly.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/* WARNING: this file should *not* be used by applications. It is
   part of the implementation of the compression library and is
   subject to change. Applications should only use zlib.h.
 */

/* @(#) $Id: zutil.h,v 1.1 2000/01/01 03:32:23 davem Exp $ */

#ifndef _Z_UTIL_H
#define _Z_UTIL_H

#include <linux/zlib.h>
#include <linux/string.h>
#include <linux/kernel.h>

typedef unsigned char  uch;
typedef unsigned short ush;
typedef unsigned long  ulg;

        /* common constants */

#define STORED_BLOCK 0
#define STATIC_TREES 1
#define DYN_TREES    2
/* The three kinds of block type */

#define MIN_MATCH  3
#define MAX_MATCH  258
/* The minimum and maximum match lengths */

#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */

        /* target dependencies */

        /* Common defaults */

#ifndef OS_CODE
#  define OS_CODE  0x03  /* assume Unix */
#endif

        /* functions */

typedef uLong (*check_func) (uLong check, const Byte *buf, uInt len);

        /* checksum functions */

#define BASE 65521L /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

#define DO1(buf,i)  {s1 += buf[i]; s2 += s1;}
#define DO2(buf,i)  DO1(buf,i); DO1(buf,i+1);
#define DO4(buf,i)  DO2(buf,i); DO2(buf,i+2);
#define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
#define DO16(buf)   DO8(buf,0); DO8(buf,8);

/* ========================================================================= */
/*
     Update a running Adler-32 checksum with the bytes buf[0..len-1] and
   return the updated checksum. If buf is NULL, this function returns the
   required initial value for the checksum.
     An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
   much faster. Usage example:

     uLong adler = zlib_adler32(0L, NULL, 0);

     while (read_buffer(buffer, length) != EOF) {
       adler = zlib_adler32(adler, buffer, length);
     }
     if (adler != original_adler) error();
*/
static inline uLong zlib_adler32(uLong adler, const Byte *buf, uInt len)
{
    unsigned long s1 = adler & 0xffff;
    unsigned long s2 = (adler >> 16) & 0xffff;
    int k;

    if (buf == NULL)
        return 1L;

    while (len > 0) {
        k = len < NMAX ? len : NMAX;
        len -= k;
        while (k >= 16) {
            DO16(buf);
            buf += 16;
            k -= 16;
        }
        if (k != 0) do {
            s1 += *buf++;
            s2 += s1;
        } while (--k);
        s1 %= BASE;
        s2 %= BASE;
    }
    return (s2 << 16) | s1;
}

#endif /* _Z_UTIL_H */
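/*
 * A small hedged sketch of how the incremental interface above is meant to be
 * used: feeding a buffer through zlib_adler32() in chunks yields the same
 * Adler-32 as a single pass over the whole buffer, because the running (s1, s2)
 * state is carried in the returned value. Illustration only; it assumes it is
 * compiled somewhere zutil.h (and thus the uLong/Byte/uInt types from
 * <linux/zlib.h>) is visible. The 16-byte chunk size is arbitrary.
 */
static inline int zlib_adler32_chunked_matches(const Byte *buf, uInt len)
{
        uLong whole = zlib_adler32(0L, NULL, 0);    /* initial value (1) */
        uLong chunked = zlib_adler32(0L, NULL, 0);
        uInt off;

        whole = zlib_adler32(whole, buf, len);

        for (off = 0; off < len; off += 16) {
                uInt n = (len - off < 16) ? (len - off) : 16;

                chunked = zlib_adler32(chunked, buf + off, n);
        }

        return whole == chunked;    /* always true for any buf/len */
}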
// SPDX-License-Identifier: GPL-2.0
/*
 * List pending timers
 *
 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
 */
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/nmi.h>

#include <linux/uaccess.h>

#include "tick-internal.h"

struct timer_list_iter {
        int cpu;
        bool second_pass;
        u64 now;
};

/*
 * This allows printing both to /proc/timer_list and
 * to the console (on SysRq-Q):
 */
__printf(2, 3)
static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);

        if (m)
                seq_vprintf(m, fmt, args);
        else
                vprintk(fmt, args);

        va_end(args);
}

static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
            int idx, u64 now)
{
        SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr,
                   ACCESS_PRIVATE(timer, function));
        SEQ_printf(m, ", S:%02x", timer->state);
        SEQ_printf(m, "\n");
        SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
                (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
                (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
                (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
                (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
}

static void
print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
                    u64 now)
{
        struct hrtimer *timer, tmp;
        unsigned long next = 0, i;
        struct timerqueue_node *curr;
        unsigned long flags;

next_one:
        i = 0;

        touch_nmi_watchdog();

        raw_spin_lock_irqsave(&base->cpu_base->lock, flags);

        curr = timerqueue_getnext(&base->active);
        /*
         * Crude but we have to do this O(N*N) thing, because
         * we have to unlock the base when printing:
         */
        while (curr && i < next) {
                curr = timerqueue_iterate_next(curr);
                i++;
        }

        if (curr) {
                timer = container_of(curr, struct hrtimer, node);
                tmp = *timer;
                raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);

                print_timer(m, timer, &tmp, i, now);
                next++;
                goto next_one;
        }
        raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
}

static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
        SEQ_printf(m, " .base: %p\n", base);
        SEQ_printf(m, " .index: %d\n", base->index);
        SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
#ifdef CONFIG_HIGH_RES_TIMERS
        SEQ_printf(m, " .offset: %Lu nsecs\n",
                   (unsigned long long) ktime_to_ns(base->offset));
#endif
        SEQ_printf(m, "active timers:\n");
        print_active_timers(m, base, now + ktime_to_ns(base->offset));
}

static void print_cpu(struct seq_file *m, int cpu, u64 now)
{
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int i;

        SEQ_printf(m, "cpu: %d\n", cpu);
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                SEQ_printf(m, " clock %d:\n", i);
                print_base(m, cpu_base->clock_base + i, now);
        }
#define P(x) \
        SEQ_printf(m, " .%-15s: %Lu\n", #x, \
                   (unsigned long long)(cpu_base->x))
#define P_ns(x) \
        SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
                   (unsigned long long)(ktime_to_ns(cpu_base->x)))

#ifdef CONFIG_HIGH_RES_TIMERS
        P_ns(expires_next);
        P(hres_active);
        P(nr_events);
        P(nr_retries);
        P(nr_hangs);
        P(max_hang_time);
#endif
#undef P
#undef P_ns

#ifdef CONFIG_TICK_ONESHOT
# define P(x) \
        SEQ_printf(m, " .%-15s: %Lu\n", #x, \
                   (unsigned long long)(ts->x))
# define P_ns(x) \
        SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
                   (unsigned long long)(ktime_to_ns(ts->x)))
# define P_flag(x, f) \
        SEQ_printf(m, " .%-15s: %d\n", #x, !!(ts->flags & (f)))

        {
                struct tick_sched *ts = tick_get_tick_sched(cpu);

                P_flag(nohz, TS_FLAG_NOHZ);
                P_flag(highres, TS_FLAG_HIGHRES);
                P_ns(last_tick);
                P_flag(tick_stopped, TS_FLAG_STOPPED);
                P(idle_jiffies);
                P(idle_calls);
                P(idle_sleeps);
                P_ns(idle_entrytime);
                P_ns(idle_waketime);
                P_ns(idle_exittime);
                P_ns(idle_sleeptime);
                P_ns(iowait_sleeptime);
                P(last_jiffies);
                P(next_timer);
                P_ns(idle_expires);
                SEQ_printf(m, "jiffies: %Lu\n",
                           (unsigned long long)jiffies);
        }
#endif

#undef P
#undef P_ns
        SEQ_printf(m, "\n");
}

#ifdef CONFIG_GENERIC_CLOCKEVENTS
static void
print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
        struct clock_event_device *dev = td->evtdev;

        touch_nmi_watchdog();

        SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
        if (cpu < 0)
                SEQ_printf(m, "Broadcast device\n");
        else
                SEQ_printf(m, "Per CPU device: %d\n", cpu);

        SEQ_printf(m, "Clock Event Device: ");
        if (!dev) {
                SEQ_printf(m, "<NULL>\n");
                return;
        }
        SEQ_printf(m, "%s\n", dev->name);
        SEQ_printf(m, " max_delta_ns: %llu\n",
                   (unsigned long long) dev->max_delta_ns);
        SEQ_printf(m, " min_delta_ns: %llu\n",
                   (unsigned long long) dev->min_delta_ns);
        SEQ_printf(m, " mult: %u\n", dev->mult);
        SEQ_printf(m, " shift: %u\n", dev->shift);
        SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev));
        SEQ_printf(m, " next_event: %Ld nsecs\n",
                   (unsigned long long) ktime_to_ns(dev->next_event));

        SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event);

        if