// SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description describing their purpose: * "afs@CAMBRIDGE.REDHAT.COM" */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse_s(struct key_preparsed_payload *); static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe_s(const struct key *, struct seq_file *); /* * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the * description and the key material as the payload. */ struct key_type key_type_rxrpc_s = { .name = "rxrpc_s", .flags = KEY_TYPE_NET_DOMAIN, .vet_description = rxrpc_vet_description_s, .preparse = rxrpc_preparse_s, .free_preparse = rxrpc_free_preparse_s, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy_s, .describe = rxrpc_describe_s, }; /* * Vet the description for an RxRPC server key. */ static int rxrpc_vet_description_s(const char *desc) { unsigned long service, sec_class; char *p; service = simple_strtoul(desc, &p, 10); if (*p != ':' || service > 65535) return -EINVAL; sec_class = simple_strtoul(p + 1, &p, 10); if ((*p && *p != ':') || sec_class < 1 || sec_class > 255) return -EINVAL; return 0; } /* * Preparse a server secret key.
*/ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec; unsigned int service, sec_class; int n; _enter("%zu", prep->datalen); if (!prep->orig_description) return -EINVAL; if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2) return -EINVAL; sec = rxrpc_security_lookup(sec_class); if (!sec) return -ENOPKG; prep->payload.data[1] = (struct rxrpc_security *)sec; if (!sec->preparse_server_key) return -EINVAL; return sec->preparse_server_key(prep); } static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec = prep->payload.data[1]; if (sec && sec->free_preparse_server_key) sec->free_preparse_server_key(prep); } static void rxrpc_destroy_s(struct key *key) { const struct rxrpc_security *sec = key->payload.data[1]; if (sec && sec->destroy_server_key) sec->destroy_server_key(key); } static void rxrpc_describe_s(const struct key *key, struct seq_file *m) { const struct rxrpc_security *sec = key->payload.data[1]; seq_puts(m, key->description); if (sec && sec->describe_server_key) sec->describe_server_key(key, m); } /* * grab the security keyring for a server socket */ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key(&key_type_keyring, description, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->securities = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /** * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service * @sk: The socket to set the keyring on * @keyring: The keyring to set * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. * * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { struct rxrpc_sock *rx = rxrpc_sk(sk); int ret = 0; lock_sock(sk); if (rx->securities) ret = -EINVAL; else if (rx->sk.sk_state != RXRPC_UNBOUND) ret = -EISCONN; else rx->securities = key_get(keyring); release_sock(sk); return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); /** * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service * @sk: The socket to set the keyring on * @set: True to set, false to clear the flag * * Set the flag on an rxrpc socket to say that the caller wants to manage the * RESPONSE packet and the user-defined data it may contain. Setting this * means that recvmsg() will return messages with RXRPC_CHALLENGED in the * control message buffer containing information about the challenge. * * The user should respond to the challenge by passing RXRPC_RESPOND or * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be * included to indicate the parts the user wants to supply. * * The server will be passed the response data with a RXRPC_RESPONDED control * message when it gets the first data from each call. * * Note that this is only honoured by security classes that need auxiliary data * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond * without consulting userspace. * * Return: The previous setting. 
*/ int rxrpc_sock_set_manage_response(struct sock *sk, bool set) { struct rxrpc_sock *rx = rxrpc_sk(sk); int ret; lock_sock(sk); ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); if (set) set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); else clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); release_sock(sk); return ret; } EXPORT_SYMBOL(rxrpc_sock_set_manage_response);
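For context, the sketch below shows how a server key of this type might be provisioned from userspace and handed to an AF_RXRPC server socket via the RXRPC_SECURITY_KEYRING socket option, which lands in rxrpc_server_keyring() above. This is a minimal sketch assuming keyutils (link with -lkeyutils) and <linux/rxrpc.h>; the service ID 52, security index 2 (rxkad), secret bytes, keyring name and the setup_server_security() helper are placeholders, not values taken from this file.

/*
 * Hypothetical userspace sketch (not part of this file).  The service ID,
 * security index, secret and keyring name are illustrative placeholders.
 */
#include <string.h>
#include <sys/socket.h>
#include <keyutils.h>
#include <linux/rxrpc.h>

#ifndef SOL_RXRPC
#define SOL_RXRPC 272
#endif

int setup_server_security(int rxrpc_fd)
{
	static const unsigned char secret[8] = {
		0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08
	};
	const char *ring_name = "rxrpc_server_keys";
	key_serial_t ring, key;

	/* Create (or reuse) a keyring hung off the process keyring. */
	ring = add_key("keyring", ring_name, NULL, 0, KEY_SPEC_PROCESS_KEYRING);
	if (ring < 0)
		return -1;

	/* Description is "<serviceId>:<securityIndex>"; payload is the secret. */
	key = add_key("rxrpc_s", "52:2", secret, sizeof(secret), ring);
	if (key < 0)
		return -1;

	/* Point the server socket at the keyring by its description. */
	if (setsockopt(rxrpc_fd, SOL_RXRPC, RXRPC_SECURITY_KEYRING,
		       ring_name, strlen(ring_name)) < 0)
		return -1;

	return 0;
}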
// SPDX-License-Identifier: GPL-2.0-or-later /* Provide a way to create a superblock configuration context within the kernel * that allows a superblock to be set up prior to mounting. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/security.h> #include <linux/mnt_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> #include <asm/sections.h> #include "mount.h" #include "internal.h" enum legacy_fs_param { LEGACY_FS_UNSET_PARAMS, LEGACY_FS_MONOLITHIC_PARAMS, LEGACY_FS_INDIVIDUAL_PARAMS, }; struct legacy_fs_context { char *legacy_data; /* Data page for legacy filesystems */ size_t data_size; enum legacy_fs_param param_type; }; static int legacy_init_fs_context(struct fs_context *fc); static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, }; static const struct constant_table common_clear_sb_flag[] = { { "async", SB_SYNCHRONOUS }, { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, { }, }; /* * Check for a common mount option that manipulates s_flags. */ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { fc->sb_flags |= token; fc->sb_flags_mask |= token; return 0; } token = lookup_constant(common_clear_sb_flag, key, 0); if (token) { fc->sb_flags &= ~token; fc->sb_flags_mask |= token; return 0; } return -ENOPARAM; } /** * vfs_parse_fs_param_source - Handle setting "source" via parameter * @fc: The filesystem context to modify * @param: The parameter * * This is a simple helper for filesystems to verify that the "source" they * accept is sane. * * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and * -EINVAL otherwise. In the event of failure, supplementary error information * is logged. */ int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) { if (strcmp(param->key, "source") != 0) return -ENOPARAM; if (param->type != fs_value_is_string) return invalf(fc, "Non-string source"); if (fc->source) return invalf(fc, "Multiple sources"); fc->source = param->string; param->string = NULL; return 0; } EXPORT_SYMBOL(vfs_parse_fs_param_source); /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify * @param: The parameter * * A single mount option in string form is applied to the filesystem context * being set up. Certain standard options (for example "ro") are translated * into flag bits without going to the filesystem. The active security module * is allowed to observe and poach options. Any other options are passed over * to the filesystem to parse. * * This may be called multiple times for a context. * * Returns 0 on success and a negative error code on failure. In the event of * failure, supplementary error information may have been set. */ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) { int ret; if (!param->key) return invalf(fc, "Unnamed parameter\n"); ret = vfs_parse_sb_flag(fc, param->key); if (ret != -ENOPARAM) return ret; ret = security_fs_context_parse_param(fc, param); if (ret != -ENOPARAM) /* Param belongs to the LSM or is disallowed by the LSM; so * don't pass to the FS. 
*/ return ret; if (fc->ops->parse_param) { ret = fc->ops->parse_param(fc, param); if (ret != -ENOPARAM) return ret; } /* If the filesystem doesn't take any arguments, give it the * default handling of source. */ ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; return invalf(fc, "%s: Unknown parameter '%s'", fc->fs_type->name, param->key); } EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. * @fc: Filesystem context. * @key: Parameter name. * @value: Default value. * @v_size: Maximum number of bytes in the value. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, .size = v_size, }; if (value) { param.string = kmemdup_nul(value, v_size, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } ret = vfs_parse_fs_param(fc, ¶m); kfree(param.string); return ret; } EXPORT_SYMBOL(vfs_parse_fs_string); /** * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data * @fc: The superblock configuration to fill in. * @data: The data to parse * @sep: callback for separating next option * * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom * option separator callback. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, char *(*sep)(char **)) { char *options = data, *key; int ret = 0; if (!options) return 0; ret = security_sb_eat_lsm_opts(options, &fc->security); if (ret) return ret; while ((key = sep(&options)) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); if (value) { if (unlikely(value == key)) continue; *value++ = 0; v_len = strlen(value); } ret = vfs_parse_fs_string(fc, key, value, v_len); if (ret < 0) break; } } return ret; } EXPORT_SYMBOL(vfs_parse_monolithic_sep); static char *vfs_parse_comma_sep(char **s) { return strsep(s, ","); } /** * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data * @fc: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be * called from the ->monolithic_mount_data() fs_context operation. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ int generic_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep); } EXPORT_SYMBOL(generic_parse_monolithic); /** * alloc_fs_context - Create a filesystem context. * @fs_type: The filesystem type. * @reference: The dentry from which this one derives (or NULL) * @sb_flags: Filesystem/superblock flags (SB_*) * @sb_flags_mask: Applicable members of @sb_flags * @purpose: The purpose that this configuration shall be used for. * * Open a filesystem and create a mount context. The mount context is * initialised with the supplied flags and, if a submount/automount from * another superblock (referred to by @reference) is supplied, may have * parameters such as namespaces copied across from that superblock. 
*/ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct dentry *reference, unsigned int sb_flags, unsigned int sb_flags_mask, enum fs_context_purpose purpose) { int (*init_fs_context)(struct fs_context *); struct fs_context *fc; int ret = -ENOMEM; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); fc->purpose = purpose; fc->sb_flags = sb_flags; fc->sb_flags_mask = sb_flags_mask; fc->fs_type = get_filesystem(fs_type); fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); fc->log.prefix = fs_type->name; mutex_init(&fc->uapi_mutex); switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); break; case FS_CONTEXT_FOR_SUBMOUNT: fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); break; case FS_CONTEXT_FOR_RECONFIGURE: atomic_inc(&reference->d_sb->s_active); fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); fc->root = dget(reference); break; } /* TODO: Make all filesystems support this unconditionally */ init_fs_context = fc->fs_type->init_fs_context; if (!init_fs_context) init_fs_context = legacy_init_fs_context; ret = init_fs_context(fc); if (ret < 0) goto err_fc; fc->need_free = true; return fc; err_fc: put_fs_context(fc); return ERR_PTR(ret); } struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, unsigned int sb_flags) { return alloc_fs_context(fs_type, NULL, sb_flags, 0, FS_CONTEXT_FOR_MOUNT); } EXPORT_SYMBOL(fs_context_for_mount); struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, unsigned int sb_flags, unsigned int sb_flags_mask) { return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); } EXPORT_SYMBOL(fs_context_for_reconfigure); /** * fs_context_for_submount: allocate a new fs_context for a submount * @type: file_system_type of the new context * @reference: reference dentry from which to copy relevant info * * Allocate a new fs_context suitable for a submount. This also ensures that * the fc->security object is inherited from @reference (if needed). */ struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { struct fs_context *fc; int ret; fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); if (IS_ERR(fc)) return fc; ret = security_fs_context_submount(fc, reference->d_sb); if (ret) { put_fs_context(fc); return ERR_PTR(ret); } return fc; } EXPORT_SYMBOL(fs_context_for_submount); void fc_drop_locked(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; dput(fc->root); fc->root = NULL; deactivate_locked_super(sb); } static void legacy_fs_context_free(struct fs_context *fc); /** * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. 
*/ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) { struct fs_context *fc; int ret; if (!src_fc->ops->dup) return ERR_PTR(-EOPNOTSUPP); fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); if (!fc) return ERR_PTR(-ENOMEM); mutex_init(&fc->uapi_mutex); fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; fc->security = NULL; get_filesystem(fc->fs_type); get_net(fc->net_ns); get_user_ns(fc->user_ns); get_cred(fc->cred); if (fc->log.log) refcount_inc(&fc->log.log->usage); /* Can't call put until we've called ->dup */ ret = fc->ops->dup(fc, src_fc); if (ret < 0) goto err_fc; ret = security_fs_context_dup(fc, src_fc); if (ret < 0) goto err_fc; return fc; err_fc: put_fs_context(fc); return ERR_PTR(ret); } EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context * @log: The filesystem context to log to, or NULL to use printk. * @prefix: A string to prefix the output with, or NULL. * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) { va_list va; struct va_format vaf = {.fmt = fmt, .va = &va}; va_start(va, fmt); if (!log) { switch (level) { case 'w': printk(KERN_WARNING "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; case 'e': printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; case 'i': printk(KERN_INFO "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; default: printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; } } else { unsigned int logsize = ARRAY_SIZE(log->buffer); u8 index; char *q = kasprintf(GFP_KERNEL, "%c %s%s%pV\n", level, prefix ? prefix : "", prefix ? ": " : "", &vaf); index = log->head & (logsize - 1); BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) || sizeof(log->tail) != sizeof(u8)); if ((u8)(log->head - log->tail) == logsize) { /* The buffer is full, discard the oldest message */ if (log->need_free & (1 << index)) kfree(log->buffer[index]); log->tail++; } log->buffer[index] = q ? q : "OOM: Can't store error string"; if (q) log->need_free |= 1 << index; else log->need_free &= ~(1 << index); log->head++; } va_end(va); } EXPORT_SYMBOL(logfc); /* * Free a logging structure. */ static void put_fc_log(struct fs_context *fc) { struct fc_log *log = fc->log.log; int i; if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); } } } /** * put_fs_context - Dispose of a superblock configuration context. * @fc: The context to dispose of. */ void put_fs_context(struct fs_context *fc) { struct super_block *sb; if (fc->root) { sb = fc->root->d_sb; dput(fc->root); fc->root = NULL; deactivate_super(sb); } if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); security_free_mnt_opts(&fc->security); put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); kfree(fc); } EXPORT_SYMBOL(put_fs_context); /* * Free the config for a filesystem that doesn't support fs_context. */ static void legacy_fs_context_free(struct fs_context *fc) { struct legacy_fs_context *ctx = fc->fs_private; if (ctx) { if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) kfree(ctx->legacy_data); kfree(ctx); } } /* * Duplicate a legacy config. 
*/ static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { struct legacy_fs_context *ctx; struct legacy_fs_context *src_ctx = src_fc->fs_private; ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { ctx->legacy_data = kmemdup(src_ctx->legacy_data, src_ctx->data_size, GFP_KERNEL); if (!ctx->legacy_data) { kfree(ctx); return -ENOMEM; } } fc->fs_private = ctx; return 0; } /* * Add a parameter to a legacy config. We build up a comma-separated list of * options. */ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct legacy_fs_context *ctx = fc->fs_private; unsigned int size = ctx->data_size; size_t len = 0; int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); switch (param->type) { case fs_value_is_string: len = 1 + param->size; fallthrough; case fs_value_is_flag: len += strlen(param->key); break; default: return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", param->key); } if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && memchr(param->string, ',', param->size))) return invalf(fc, "VFS: Legacy: Option '%s' contained comma", param->key); if (!ctx->legacy_data) { ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!ctx->legacy_data) return -ENOMEM; } if (size) ctx->legacy_data[size++] = ','; len = strlen(param->key); memcpy(ctx->legacy_data + size, param->key, len); size += len; if (param->type == fs_value_is_string) { ctx->legacy_data[size++] = '='; memcpy(ctx->legacy_data + size, param->string, param->size); size += param->size; } ctx->legacy_data[size] = '\0'; ctx->data_size = size; ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; return 0; } /* * Add monolithic mount data. */ static int legacy_parse_monolithic(struct fs_context *fc, void *data) { struct legacy_fs_context *ctx = fc->fs_private; if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { pr_warn("VFS: Can't mix monolithic and individual options\n"); return -EINVAL; } ctx->legacy_data = data; ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; if (!ctx->legacy_data) return 0; if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) return 0; return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); } /* * Get a mountable root with the legacy mount command. */ static int legacy_get_tree(struct fs_context *fc) { struct legacy_fs_context *ctx = fc->fs_private; struct super_block *sb; struct dentry *root; root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, fc->source, ctx->legacy_data); if (IS_ERR(root)) return PTR_ERR(root); sb = root->d_sb; BUG_ON(!sb); fc->root = root; return 0; } /* * Handle remount. */ static int legacy_reconfigure(struct fs_context *fc) { struct legacy_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; if (!sb->s_op->remount_fs) return 0; return sb->s_op->remount_fs(sb, &fc->sb_flags, ctx ? 
ctx->legacy_data : NULL); } const struct fs_context_operations legacy_fs_context_ops = { .free = legacy_fs_context_free, .dup = legacy_fs_context_dup, .parse_param = legacy_parse_param, .parse_monolithic = legacy_parse_monolithic, .get_tree = legacy_get_tree, .reconfigure = legacy_reconfigure, }; /* * Initialise a legacy context for a filesystem that doesn't support * fs_context. */ static int legacy_init_fs_context(struct fs_context *fc) { fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; return 0; } int parse_monolithic_mount_data(struct fs_context *fc, void *data) { int (*monolithic_mount_data)(struct fs_context *, void *); monolithic_mount_data = fc->ops->parse_monolithic; if (!monolithic_mount_data) monolithic_mount_data = generic_parse_monolithic; return monolithic_mount_data(fc, data); } /* * Clean up a context after performing an action on it and put it into a state * from where it can be used to reconfigure a superblock. * * Note that here we do only the parts that can't fail; the rest is in * finish_clean_context() below and in between those fs_context is marked * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after * successful mount or remount we need to report success to userland. * Trying to do full reinit (for the sake of possible subsequent remount) * and failing to allocate memory would've put us into a nasty situation. * So here we only discard the old state and reinitialization is left * until we actually try to reconfigure. */ void vfs_clean_context(struct fs_context *fc) { if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); fc->need_free = false; fc->fs_private = NULL; fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); kfree(fc->source); fc->source = NULL; fc->exclusive = false; fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; fc->phase = FS_CONTEXT_AWAITING_RECONF; } int finish_clean_context(struct fs_context *fc) { int error; if (fc->phase != FS_CONTEXT_AWAITING_RECONF) return 0; if (fc->fs_type->init_fs_context) error = fc->fs_type->init_fs_context(fc); else error = legacy_init_fs_context(fc); if (unlikely(error)) { fc->phase = FS_CONTEXT_FAILED; return error; } fc->need_free = true; fc->phase = FS_CONTEXT_RECONF_PARAMS; return 0; } |
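To make the dispatch above concrete, here is a hedged sketch of what a filesystem that opts in to this API provides, so that vfs_parse_fs_param() hands each mount option to its ->parse_param hook rather than the legacy path. "examplefs" and every examplefs_* name are invented for illustration; registration with register_filesystem() is omitted.

/*
 * Hypothetical "examplefs" sketch (not from the kernel tree): the hooks a
 * filesystem supplies to use the fs_context API implemented above.
 */
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/module.h>
#include <linux/slab.h>

struct examplefs_ctx {
	unsigned int	timeout;	/* illustrative mount option */
};

enum { Opt_timeout };

static const struct fs_parameter_spec examplefs_parameters[] = {
	fsparam_u32("timeout", Opt_timeout),
	{}
};

static int examplefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct examplefs_ctx *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, examplefs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_timeout:
		ctx->timeout = result.uint_32;
		break;
	}
	return 0;
}

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	/* A real filesystem would set s_op, allocate a root inode, etc. */
	return -EOPNOTSUPP;
}

static int examplefs_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, examplefs_fill_super);
}

static void examplefs_free_fc(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations examplefs_context_ops = {
	.free		= examplefs_free_fc,
	.parse_param	= examplefs_parse_param,
	.get_tree	= examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	fc->fs_private = kzalloc(sizeof(struct examplefs_ctx), GFP_KERNEL);
	if (!fc->fs_private)
		return -ENOMEM;
	fc->ops = &examplefs_context_ops;
	return 0;
}

/* Hooked up via file_system_type; register_filesystem() call omitted. */
static struct file_system_type examplefs_type = {
	.owner			= THIS_MODULE,
	.name			= "examplefs",
	.init_fs_context	= examplefs_init_fs_context,
	.parameters		= examplefs_parameters,
	.kill_sb		= kill_anon_super,
};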
// SPDX-License-Identifier: GPL-2.0-or-later /* * Driver for USB ethernet port of Conexant CX82310-based ADSL routers * Copyright (C) 2010 by Ondrej Zary * some parts inspired by the cxacru driver */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> enum cx82310_cmd { CMD_START = 0x84, /* no effect? */ CMD_STOP = 0x85, /* no effect? */ CMD_GET_STATUS = 0x90, /* returns nothing?
*/ CMD_GET_MAC_ADDR = 0x91, /* read MAC address */ CMD_GET_LINK_STATUS = 0x92, /* not useful, link is always up */ CMD_ETHERNET_MODE = 0x99, /* unknown, needed during init */ }; enum cx82310_status { STATUS_UNDEFINED, STATUS_SUCCESS, STATUS_ERROR, STATUS_UNSUPPORTED, STATUS_UNIMPLEMENTED, STATUS_PARAMETER_ERROR, STATUS_DBG_LOOPBACK, }; #define CMD_PACKET_SIZE 64 #define CMD_TIMEOUT 100 #define CMD_REPLY_RETRY 5 #define CX82310_MTU 1514 #define CMD_EP 0x01 struct cx82310_priv { struct work_struct reenable_work; struct usbnet *dev; }; /* * execute control command * - optionally send some data (command parameters) * - optionally wait for the reply * - optionally read some data from the reply */ static int cx82310_cmd(struct usbnet *dev, enum cx82310_cmd cmd, bool reply, u8 *wdata, int wlen, u8 *rdata, int rlen) { int actual_len, retries, ret; struct usb_device *udev = dev->udev; u8 *buf = kzalloc(CMD_PACKET_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; /* create command packet */ buf[0] = cmd; if (wdata) memcpy(buf + 4, wdata, min_t(int, wlen, CMD_PACKET_SIZE - 4)); /* send command packet */ ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, CMD_EP), buf, CMD_PACKET_SIZE, &actual_len, CMD_TIMEOUT); if (ret < 0) { if (cmd != CMD_GET_LINK_STATUS) netdev_err(dev->net, "send command %#x: error %d\n", cmd, ret); goto end; } if (reply) { /* wait for reply, retry if it's empty */ for (retries = 0; retries < CMD_REPLY_RETRY; retries++) { ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, CMD_EP), buf, CMD_PACKET_SIZE, &actual_len, CMD_TIMEOUT); if (ret < 0) { if (cmd != CMD_GET_LINK_STATUS) netdev_err(dev->net, "reply receive error %d\n", ret); goto end; } if (actual_len > 0) break; } if (actual_len == 0) { netdev_err(dev->net, "no reply to command %#x\n", cmd); ret = -EIO; goto end; } if (buf[0] != cmd) { netdev_err(dev->net, "got reply to command %#x, expected: %#x\n", buf[0], cmd); ret = -EIO; goto end; } if (buf[1] != STATUS_SUCCESS) { netdev_err(dev->net, "command %#x failed: %#x\n", cmd, buf[1]); ret = -EIO; goto end; } if (rdata) memcpy(rdata, buf + 4, min_t(int, rlen, CMD_PACKET_SIZE - 4)); } end: kfree(buf); return ret; } static int cx82310_enable_ethernet(struct usbnet *dev) { int ret = cx82310_cmd(dev, CMD_ETHERNET_MODE, true, "\x01", 1, NULL, 0); if (ret) netdev_err(dev->net, "unable to enable ethernet mode: %d\n", ret); return ret; } static void cx82310_reenable_work(struct work_struct *work) { struct cx82310_priv *priv = container_of(work, struct cx82310_priv, reenable_work); cx82310_enable_ethernet(priv->dev); } #define partial_len data[0] /* length of partial packet data */ #define partial_rem data[1] /* remaining (missing) data length */ #define partial_data data[2] /* partial packet data */ static int cx82310_bind(struct usbnet *dev, struct usb_interface *intf) { int ret; char buf[15]; struct usb_device *udev = dev->udev; u8 link[3]; int timeout = 50; struct cx82310_priv *priv; u8 addr[ETH_ALEN]; /* avoid ADSL modems - continue only if iProduct is "USB NET CARD" */ if (usb_string(udev, udev->descriptor.iProduct, buf, sizeof(buf)) > 0 && strcmp(buf, "USB NET CARD")) { dev_info(&udev->dev, "ignoring: probably an ADSL modem\n"); return -ENODEV; } ret = usbnet_get_endpoints(dev, intf); if (ret) return ret; /* * this must not include ethernet header as the device can send partial * packets with no header (and sometimes even empty URBs) */ dev->net->hard_header_len = 0; /* we can send at most 1514 bytes of data (+ 2-byte header) per URB */ dev->hard_mtu = CX82310_MTU + 2; /* we can receive URBs up 
to 4KB from the device */ dev->rx_urb_size = 4096; dev->partial_data = (unsigned long) kmalloc(dev->hard_mtu, GFP_KERNEL); if (!dev->partial_data) return -ENOMEM; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { ret = -ENOMEM; goto err_partial; } dev->driver_priv = priv; INIT_WORK(&priv->reenable_work, cx82310_reenable_work); priv->dev = dev; /* wait for firmware to become ready (indicated by the link being up) */ while (--timeout) { ret = cx82310_cmd(dev, CMD_GET_LINK_STATUS, true, NULL, 0, link, sizeof(link)); /* the command can time out during boot - it's not an error */ if (!ret && link[0] == 1 && link[2] == 1) break; msleep(500); } if (!timeout) { netdev_err(dev->net, "firmware not ready in time\n"); ret = -ETIMEDOUT; goto err; } /* enable ethernet mode (?) */ ret = cx82310_enable_ethernet(dev); if (ret) goto err; /* get the MAC address */ ret = cx82310_cmd(dev, CMD_GET_MAC_ADDR, true, NULL, 0, addr, ETH_ALEN); if (ret) { netdev_err(dev->net, "unable to read MAC address: %d\n", ret); goto err; } eth_hw_addr_set(dev->net, addr); /* start (does not seem to have any effect?) */ ret = cx82310_cmd(dev, CMD_START, false, NULL, 0, NULL, 0); if (ret) goto err; return 0; err: kfree(dev->driver_priv); err_partial: kfree((void *)dev->partial_data); return ret; } static void cx82310_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cx82310_priv *priv = dev->driver_priv; kfree((void *)dev->partial_data); cancel_work_sync(&priv->reenable_work); kfree(dev->driver_priv); } /* * RX is NOT easy - we can receive multiple packets per skb, each having 2-byte * packet length at the beginning. * The last packet might be incomplete (when it crosses the 4KB URB size), * continuing in the next skb (without any headers). * If a packet has odd length, there is one extra byte at the end (before next * packet or at the end of the URB). */ static int cx82310_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { int len; struct sk_buff *skb2; struct cx82310_priv *priv = dev->driver_priv; /* * If the last skb ended with an incomplete packet, this skb contains * end of that packet at the beginning. 
*/ if (dev->partial_rem) { len = dev->partial_len + dev->partial_rem; skb2 = alloc_skb(len, GFP_ATOMIC); if (!skb2) return 0; skb_put(skb2, len); memcpy(skb2->data, (void *)dev->partial_data, dev->partial_len); memcpy(skb2->data + dev->partial_len, skb->data, dev->partial_rem); usbnet_skb_return(dev, skb2); skb_pull(skb, (dev->partial_rem + 1) & ~1); dev->partial_rem = 0; if (skb->len < 2) return 1; } /* a skb can contain multiple packets */ while (skb->len > 1) { /* first two bytes are packet length */ len = skb->data[0] | (skb->data[1] << 8); skb_pull(skb, 2); /* if last packet in the skb, let usbnet to process it */ if (len == skb->len || len + 1 == skb->len) { skb_trim(skb, len); break; } if (len == 0xffff) { netdev_info(dev->net, "router was rebooted, re-enabling ethernet mode"); schedule_work(&priv->reenable_work); } else if (len > CX82310_MTU) { netdev_err(dev->net, "RX packet too long: %d B\n", len); return 0; } /* incomplete packet, save it for the next skb */ if (len > skb->len) { dev->partial_len = skb->len; dev->partial_rem = len - skb->len; memcpy((void *)dev->partial_data, skb->data, dev->partial_len); skb_pull(skb, skb->len); break; } skb2 = alloc_skb(len, GFP_ATOMIC); if (!skb2) return 0; skb_put(skb2, len); memcpy(skb2->data, skb->data, len); /* process the packet */ usbnet_skb_return(dev, skb2); skb_pull(skb, (len + 1) & ~1); } /* let usbnet process the last packet */ return 1; } /* TX is easy, just add 2 bytes of length at the beginning */ static struct sk_buff *cx82310_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int len = skb->len; if (skb_cow_head(skb, 2)) { dev_kfree_skb_any(skb); return NULL; } skb_push(skb, 2); skb->data[0] = len; skb->data[1] = len >> 8; return skb; } static const struct driver_info cx82310_info = { .description = "Conexant CX82310 USB ethernet", .flags = FLAG_ETHER, .bind = cx82310_bind, .unbind = cx82310_unbind, .rx_fixup = cx82310_rx_fixup, .tx_fixup = cx82310_tx_fixup, }; #define USB_DEVICE_CLASS(vend, prod, cl, sc, pr) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_DEV_INFO, \ .idVendor = (vend), \ .idProduct = (prod), \ .bDeviceClass = (cl), \ .bDeviceSubClass = (sc), \ .bDeviceProtocol = (pr) static const struct usb_device_id products[] = { { USB_DEVICE_CLASS(0x0572, 0xcb01, 0xff, 0, 0), .driver_info = (unsigned long) &cx82310_info }, { }, }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver cx82310_driver = { .name = "cx82310_eth", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(cx82310_driver); MODULE_AUTHOR("Ondrej Zary"); MODULE_DESCRIPTION("Conexant CX82310-based ADSL router USB ethernet driver"); MODULE_LICENSE("GPL"); |
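As a side note, the bulk-in framing that cx82310_rx_fixup() undoes can be shown as a small self-contained sketch (plain userspace C, not driver code). It mirrors only the 2-byte little-endian length prefix, the even-length padding and the 0xffff reboot marker described above, and leaves out the partial-packet carry-over between URBs; walk_cx82310_urb() is an invented helper name.

/*
 * Illustrative sketch of the CX82310 bulk-in framing, independent of usbnet.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void walk_cx82310_urb(const uint8_t *buf, size_t len)
{
	size_t off = 0;

	while (off + 2 <= len) {
		/* first two bytes are the little-endian packet length */
		size_t pkt_len = buf[off] | (buf[off + 1] << 8);

		off += 2;
		if (pkt_len == 0xffff) {
			printf("router rebooted, ethernet mode must be re-enabled\n");
			return;
		}
		if (off + pkt_len > len) {
			printf("partial packet: %zu of %zu bytes present\n",
			       len - off, pkt_len);
			return;
		}
		printf("packet of %zu bytes at offset %zu\n", pkt_len, off);
		/* packets are padded to an even number of bytes */
		off += (pkt_len + 1) & ~(size_t)1;
	}
}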
/* SPDX-License-Identifier: ISC */ /* * Copyright (c) 2005-2011 Atheros Communications Inc. * Copyright (c) 2011-2017 Qualcomm Atheros, Inc. * Copyright (c) 2018, The Linux Foundation. All rights reserved. * Copyright (c) 2021, 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. */ #ifndef _HTT_H_ #define _HTT_H_ #include <linux/bug.h> #include <linux/interrupt.h> #include <linux/dmapool.h> #include <linux/hashtable.h> #include <linux/kfifo.h> #include <net/mac80211.h> #include "htc.h" #include "hw.h" #include "rx_desc.h" enum htt_dbg_stats_type { HTT_DBG_STATS_WAL_PDEV_TXRX = 1 << 0, HTT_DBG_STATS_RX_REORDER = 1 << 1, HTT_DBG_STATS_RX_RATE_INFO = 1 << 2, HTT_DBG_STATS_TX_PPDU_LOG = 1 << 3, HTT_DBG_STATS_TX_RATE_INFO = 1 << 4, /* bits 5-23 currently reserved */ HTT_DBG_NUM_STATS /* keep this last */ }; enum htt_h2t_msg_type { /* host-to-target */ HTT_H2T_MSG_TYPE_VERSION_REQ = 0, HTT_H2T_MSG_TYPE_TX_FRM = 1, HTT_H2T_MSG_TYPE_RX_RING_CFG = 2, HTT_H2T_MSG_TYPE_STATS_REQ = 3, HTT_H2T_MSG_TYPE_SYNC = 4, HTT_H2T_MSG_TYPE_AGGR_CFG = 5, HTT_H2T_MSG_TYPE_FRAG_DESC_BANK_CFG = 6, /* This command is used for sending management frames in HTT < 3.0. * HTT >= 3.0 uses TX_FRM for everything. */ HTT_H2T_MSG_TYPE_MGMT_TX = 7, HTT_H2T_MSG_TYPE_TX_FETCH_RESP = 11, HTT_H2T_NUM_MSGS /* keep this last */ }; struct htt_cmd_hdr { u8 msg_type; } __packed; struct htt_ver_req { u8 pad[sizeof(u32) - sizeof(struct htt_cmd_hdr)]; } __packed; /* * HTT tx MSDU descriptor * * The HTT tx MSDU descriptor is created by the host HTT SW for each * tx MSDU. The HTT tx MSDU descriptor contains the information that * the target firmware needs for the FW's tx processing, particularly * for creating the HW msdu descriptor. * The same HTT tx descriptor is used for HL and LL systems, though * a few fields within the tx descriptor are used only by LL or * only by HL. * The HTT tx descriptor is defined in two manners: by a struct with * bitfields, and by a series of [dword offset, bit mask, bit shift] * definitions. * The target should use the struct def, for simplicity and clarity, * but the host shall use the bit-mask + bit-shift defs, to be endian- * neutral. Specifically, the host shall use the get/set macros built * around the mask + shift defs.
*/ struct htt_data_tx_desc_frag { union { struct double_word_addr { __le32 paddr; __le32 len; } __packed dword_addr; struct triple_word_addr { __le32 paddr_lo; __le16 paddr_hi; __le16 len_16; } __packed tword_addr; } __packed; } __packed; struct htt_msdu_ext_desc { __le32 tso_flag[3]; __le16 ip_identification; u8 flags; u8 reserved; struct htt_data_tx_desc_frag frags[6]; }; struct htt_msdu_ext_desc_64 { __le32 tso_flag[5]; __le16 ip_identification; u8 flags; u8 reserved; struct htt_data_tx_desc_frag frags[6]; }; #define HTT_MSDU_EXT_DESC_FLAG_IPV4_CSUM_ENABLE BIT(0) #define HTT_MSDU_EXT_DESC_FLAG_UDP_IPV4_CSUM_ENABLE BIT(1) #define HTT_MSDU_EXT_DESC_FLAG_UDP_IPV6_CSUM_ENABLE BIT(2) #define HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE BIT(3) #define HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE BIT(4) #define HTT_MSDU_CHECKSUM_ENABLE (HTT_MSDU_EXT_DESC_FLAG_IPV4_CSUM_ENABLE \ | HTT_MSDU_EXT_DESC_FLAG_UDP_IPV4_CSUM_ENABLE \ | HTT_MSDU_EXT_DESC_FLAG_UDP_IPV6_CSUM_ENABLE \ | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE \ | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE) #define HTT_MSDU_EXT_DESC_FLAG_IPV4_CSUM_ENABLE_64 BIT(16) #define HTT_MSDU_EXT_DESC_FLAG_UDP_IPV4_CSUM_ENABLE_64 BIT(17) #define HTT_MSDU_EXT_DESC_FLAG_UDP_IPV6_CSUM_ENABLE_64 BIT(18) #define HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE_64 BIT(19) #define HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE_64 BIT(20) #define HTT_MSDU_EXT_DESC_FLAG_PARTIAL_CSUM_ENABLE_64 BIT(21) #define HTT_MSDU_CHECKSUM_ENABLE_64 (HTT_MSDU_EXT_DESC_FLAG_IPV4_CSUM_ENABLE_64 \ | HTT_MSDU_EXT_DESC_FLAG_UDP_IPV4_CSUM_ENABLE_64 \ | HTT_MSDU_EXT_DESC_FLAG_UDP_IPV6_CSUM_ENABLE_64 \ | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE_64 \ | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE_64) enum htt_data_tx_desc_flags0 { HTT_DATA_TX_DESC_FLAGS0_MAC_HDR_PRESENT = 1 << 0, HTT_DATA_TX_DESC_FLAGS0_NO_AGGR = 1 << 1, HTT_DATA_TX_DESC_FLAGS0_NO_ENCRYPT = 1 << 2, HTT_DATA_TX_DESC_FLAGS0_NO_CLASSIFY = 1 << 3, HTT_DATA_TX_DESC_FLAGS0_RSVD0 = 1 << 4 #define HTT_DATA_TX_DESC_FLAGS0_PKT_TYPE_MASK 0xE0 #define HTT_DATA_TX_DESC_FLAGS0_PKT_TYPE_LSB 5 }; enum htt_data_tx_desc_flags1 { #define HTT_DATA_TX_DESC_FLAGS1_VDEV_ID_BITS 6 #define HTT_DATA_TX_DESC_FLAGS1_VDEV_ID_MASK 0x003F #define HTT_DATA_TX_DESC_FLAGS1_VDEV_ID_LSB 0 #define HTT_DATA_TX_DESC_FLAGS1_EXT_TID_BITS 5 #define HTT_DATA_TX_DESC_FLAGS1_EXT_TID_MASK 0x07C0 #define HTT_DATA_TX_DESC_FLAGS1_EXT_TID_LSB 6 HTT_DATA_TX_DESC_FLAGS1_POSTPONED = 1 << 11, HTT_DATA_TX_DESC_FLAGS1_MORE_IN_BATCH = 1 << 12, HTT_DATA_TX_DESC_FLAGS1_CKSUM_L3_OFFLOAD = 1 << 13, HTT_DATA_TX_DESC_FLAGS1_CKSUM_L4_OFFLOAD = 1 << 14, HTT_DATA_TX_DESC_FLAGS1_TX_COMPLETE = 1 << 15 }; #define HTT_TX_CREDIT_DELTA_ABS_M 0xffff0000 #define HTT_TX_CREDIT_DELTA_ABS_S 16 #define HTT_TX_CREDIT_DELTA_ABS_GET(word) \ (((word) & HTT_TX_CREDIT_DELTA_ABS_M) >> HTT_TX_CREDIT_DELTA_ABS_S) #define HTT_TX_CREDIT_SIGN_BIT_M 0x00000100 #define HTT_TX_CREDIT_SIGN_BIT_S 8 #define HTT_TX_CREDIT_SIGN_BIT_GET(word) \ (((word) & HTT_TX_CREDIT_SIGN_BIT_M) >> HTT_TX_CREDIT_SIGN_BIT_S) enum htt_data_tx_ext_tid { HTT_DATA_TX_EXT_TID_NON_QOS_MCAST_BCAST = 16, HTT_DATA_TX_EXT_TID_MGMT = 17, HTT_DATA_TX_EXT_TID_INVALID = 31 }; #define HTT_INVALID_PEERID 0xFFFF /* * htt_data_tx_desc - used for data tx path * * Note: vdev_id irrelevant for pkt_type == raw and no_classify == 1. * ext_tid: for qos-data frames (0-15), see %HTT_DATA_TX_EXT_TID_ * for special kinds of tids * postponed: only for HL hosts. 
indicates if this is a resend * (HL hosts manage queues on the host ) * more_in_batch: only for HL hosts. indicates if more packets are * pending. this allows target to wait and aggregate * freq: 0 means home channel of given vdev. intended for offchannel */ struct htt_data_tx_desc { u8 flags0; /* %HTT_DATA_TX_DESC_FLAGS0_ */ __le16 flags1; /* %HTT_DATA_TX_DESC_FLAGS1_ */ __le16 len; __le16 id; __le32 frags_paddr; union { __le32 peerid; struct { __le16 peerid; __le16 freq; } __packed offchan_tx; } __packed; u8 prefetch[0]; /* start of frame, for FW classification engine */ } __packed; struct htt_data_tx_desc_64 { u8 flags0; /* %HTT_DATA_TX_DESC_FLAGS0_ */ __le16 flags1; /* %HTT_DATA_TX_DESC_FLAGS1_ */ __le16 len; __le16 id; __le64 frags_paddr; union { __le32 peerid; struct { __le16 peerid; __le16 freq; } __packed offchan_tx; } __packed; u8 prefetch[0]; /* start of frame, for FW classification engine */ } __packed; enum htt_rx_ring_flags { HTT_RX_RING_FLAGS_MAC80211_HDR = 1 << 0, HTT_RX_RING_FLAGS_MSDU_PAYLOAD = 1 << 1, HTT_RX_RING_FLAGS_PPDU_START = 1 << 2, HTT_RX_RING_FLAGS_PPDU_END = 1 << 3, HTT_RX_RING_FLAGS_MPDU_START = 1 << 4, HTT_RX_RING_FLAGS_MPDU_END = 1 << 5, HTT_RX_RING_FLAGS_MSDU_START = 1 << 6, HTT_RX_RING_FLAGS_MSDU_END = 1 << 7, HTT_RX_RING_FLAGS_RX_ATTENTION = 1 << 8, HTT_RX_RING_FLAGS_FRAG_INFO = 1 << 9, HTT_RX_RING_FLAGS_UNICAST_RX = 1 << 10, HTT_RX_RING_FLAGS_MULTICAST_RX = 1 << 11, HTT_RX_RING_FLAGS_CTRL_RX = 1 << 12, HTT_RX_RING_FLAGS_MGMT_RX = 1 << 13, HTT_RX_RING_FLAGS_NULL_RX = 1 << 14, HTT_RX_RING_FLAGS_PHY_DATA_RX = 1 << 15 }; #define HTT_RX_RING_SIZE_MIN 128 #define HTT_RX_RING_SIZE_MAX 2048 #define HTT_RX_RING_SIZE HTT_RX_RING_SIZE_MAX #define HTT_RX_RING_FILL_LEVEL (((HTT_RX_RING_SIZE) / 2) - 1) #define HTT_RX_RING_FILL_LEVEL_DUAL_MAC (HTT_RX_RING_SIZE - 1) struct htt_rx_ring_rx_desc_offsets { /* the following offsets are in 4-byte units */ __le16 mac80211_hdr_offset; __le16 msdu_payload_offset; __le16 ppdu_start_offset; __le16 ppdu_end_offset; __le16 mpdu_start_offset; __le16 mpdu_end_offset; __le16 msdu_start_offset; __le16 msdu_end_offset; __le16 rx_attention_offset; __le16 frag_info_offset; } __packed; struct htt_rx_ring_setup_ring32 { __le32 fw_idx_shadow_reg_paddr; __le32 rx_ring_base_paddr; __le16 rx_ring_len; /* in 4-byte words */ __le16 rx_ring_bufsize; /* rx skb size - in bytes */ __le16 flags; /* %HTT_RX_RING_FLAGS_ */ __le16 fw_idx_init_val; struct htt_rx_ring_rx_desc_offsets offsets; } __packed; struct htt_rx_ring_setup_ring64 { __le64 fw_idx_shadow_reg_paddr; __le64 rx_ring_base_paddr; __le16 rx_ring_len; /* in 4-byte words */ __le16 rx_ring_bufsize; /* rx skb size - in bytes */ __le16 flags; /* %HTT_RX_RING_FLAGS_ */ __le16 fw_idx_init_val; struct htt_rx_ring_rx_desc_offsets offsets; } __packed; struct htt_rx_ring_setup_hdr { u8 num_rings; /* supported values: 1, 2 */ __le16 rsvd0; } __packed; struct htt_rx_ring_setup_32 { struct htt_rx_ring_setup_hdr hdr; struct htt_rx_ring_setup_ring32 rings[]; } __packed; struct htt_rx_ring_setup_64 { struct htt_rx_ring_setup_hdr hdr; struct htt_rx_ring_setup_ring64 rings[]; } __packed; /* * htt_stats_req - request target to send specified statistics * * @msg_type: hardcoded %HTT_H2T_MSG_TYPE_STATS_REQ * @upload_types: see %htt_dbg_stats_type. this is 24bit field actually * so make sure its little-endian. * @reset_types: see %htt_dbg_stats_type. this is 24bit field actually * so make sure its little-endian. 
* @cfg_val: stat_type specific configuration * @stat_type: see %htt_dbg_stats_type * @cookie_lsb: used for confirmation message from target->host * @cookie_msb: ditto as %cookie */ struct htt_stats_req { u8 upload_types[3]; u8 rsvd0; u8 reset_types[3]; struct { u8 mpdu_bytes; u8 mpdu_num_msdus; u8 msdu_bytes; } __packed; u8 stat_type; __le32 cookie_lsb; __le32 cookie_msb; } __packed; #define HTT_STATS_REQ_CFG_STAT_TYPE_INVALID 0xff #define HTT_STATS_BIT_MASK GENMASK(16, 0) /* * htt_oob_sync_req - request out-of-band sync * * The HTT SYNC tells the target to suspend processing of subsequent * HTT host-to-target messages until some other target agent locally * informs the target HTT FW that the current sync counter is equal to * or greater than (in a modulo sense) the sync counter specified in * the SYNC message. * * This allows other host-target components to synchronize their operation * with HTT, e.g. to ensure that tx frames don't get transmitted until a * security key has been downloaded to and activated by the target. * In the absence of any explicit synchronization counter value * specification, the target HTT FW will use zero as the default current * sync value. * * The HTT target FW will suspend its host->target message processing as long * as 0 < (in-band sync counter - out-of-band sync counter) & 0xff < 128. */ struct htt_oob_sync_req { u8 sync_count; __le16 rsvd0; } __packed; struct htt_aggr_conf { u8 max_num_ampdu_subframes; /* amsdu_subframes is limited by 0x1F mask */ u8 max_num_amsdu_subframes; } __packed; struct htt_aggr_conf_v2 { u8 max_num_ampdu_subframes; /* amsdu_subframes is limited by 0x1F mask */ u8 max_num_amsdu_subframes; u8 reserved; } __packed; #define HTT_MGMT_FRM_HDR_DOWNLOAD_LEN 32 struct htt_mgmt_tx_desc_qca99x0 { __le32 rate; } __packed; struct htt_mgmt_tx_desc { u8 pad[sizeof(u32) - sizeof(struct htt_cmd_hdr)]; __le32 msdu_paddr; __le32 desc_id; __le32 len; __le32 vdev_id; u8 hdr[HTT_MGMT_FRM_HDR_DOWNLOAD_LEN]; union { struct htt_mgmt_tx_desc_qca99x0 qca99x0; } __packed; } __packed; enum htt_mgmt_tx_status { HTT_MGMT_TX_STATUS_OK = 0, HTT_MGMT_TX_STATUS_RETRY = 1, HTT_MGMT_TX_STATUS_DROP = 2 }; /*=== target -> host messages ===============================================*/ enum htt_main_t2h_msg_type { HTT_MAIN_T2H_MSG_TYPE_VERSION_CONF = 0x0, HTT_MAIN_T2H_MSG_TYPE_RX_IND = 0x1, HTT_MAIN_T2H_MSG_TYPE_RX_FLUSH = 0x2, HTT_MAIN_T2H_MSG_TYPE_PEER_MAP = 0x3, HTT_MAIN_T2H_MSG_TYPE_PEER_UNMAP = 0x4, HTT_MAIN_T2H_MSG_TYPE_RX_ADDBA = 0x5, HTT_MAIN_T2H_MSG_TYPE_RX_DELBA = 0x6, HTT_MAIN_T2H_MSG_TYPE_TX_COMPL_IND = 0x7, HTT_MAIN_T2H_MSG_TYPE_PKTLOG = 0x8, HTT_MAIN_T2H_MSG_TYPE_STATS_CONF = 0x9, HTT_MAIN_T2H_MSG_TYPE_RX_FRAG_IND = 0xa, HTT_MAIN_T2H_MSG_TYPE_SEC_IND = 0xb, HTT_MAIN_T2H_MSG_TYPE_TX_INSPECT_IND = 0xd, HTT_MAIN_T2H_MSG_TYPE_MGMT_TX_COMPL_IND = 0xe, HTT_MAIN_T2H_MSG_TYPE_TX_CREDIT_UPDATE_IND = 0xf, HTT_MAIN_T2H_MSG_TYPE_RX_PN_IND = 0x10, HTT_MAIN_T2H_MSG_TYPE_RX_OFFLOAD_DELIVER_IND = 0x11, HTT_MAIN_T2H_MSG_TYPE_TEST, /* keep this last */ HTT_MAIN_T2H_NUM_MSGS }; enum htt_10x_t2h_msg_type { HTT_10X_T2H_MSG_TYPE_VERSION_CONF = 0x0, HTT_10X_T2H_MSG_TYPE_RX_IND = 0x1, HTT_10X_T2H_MSG_TYPE_RX_FLUSH = 0x2, HTT_10X_T2H_MSG_TYPE_PEER_MAP = 0x3, HTT_10X_T2H_MSG_TYPE_PEER_UNMAP = 0x4, HTT_10X_T2H_MSG_TYPE_RX_ADDBA = 0x5, HTT_10X_T2H_MSG_TYPE_RX_DELBA = 0x6, HTT_10X_T2H_MSG_TYPE_TX_COMPL_IND = 0x7, HTT_10X_T2H_MSG_TYPE_PKTLOG = 0x8, HTT_10X_T2H_MSG_TYPE_STATS_CONF = 0x9, HTT_10X_T2H_MSG_TYPE_RX_FRAG_IND = 0xa, HTT_10X_T2H_MSG_TYPE_SEC_IND = 0xb, 
HTT_10X_T2H_MSG_TYPE_RC_UPDATE_IND = 0xc, HTT_10X_T2H_MSG_TYPE_TX_INSPECT_IND = 0xd, HTT_10X_T2H_MSG_TYPE_TEST = 0xe, HTT_10X_T2H_MSG_TYPE_CHAN_CHANGE = 0xf, HTT_10X_T2H_MSG_TYPE_AGGR_CONF = 0x11, HTT_10X_T2H_MSG_TYPE_STATS_NOUPLOAD = 0x12, HTT_10X_T2H_MSG_TYPE_MGMT_TX_COMPL_IND = 0x13, /* keep this last */ HTT_10X_T2H_NUM_MSGS }; enum htt_tlv_t2h_msg_type { HTT_TLV_T2H_MSG_TYPE_VERSION_CONF = 0x0, HTT_TLV_T2H_MSG_TYPE_RX_IND = 0x1, HTT_TLV_T2H_MSG_TYPE_RX_FLUSH = 0x2, HTT_TLV_T2H_MSG_TYPE_PEER_MAP = 0x3, HTT_TLV_T2H_MSG_TYPE_PEER_UNMAP = 0x4, HTT_TLV_T2H_MSG_TYPE_RX_ADDBA = 0x5, HTT_TLV_T2H_MSG_TYPE_RX_DELBA = 0x6, HTT_TLV_T2H_MSG_TYPE_TX_COMPL_IND = 0x7, HTT_TLV_T2H_MSG_TYPE_PKTLOG = 0x8, HTT_TLV_T2H_MSG_TYPE_STATS_CONF = 0x9, HTT_TLV_T2H_MSG_TYPE_RX_FRAG_IND = 0xa, HTT_TLV_T2H_MSG_TYPE_SEC_IND = 0xb, HTT_TLV_T2H_MSG_TYPE_RC_UPDATE_IND = 0xc, /* deprecated */ HTT_TLV_T2H_MSG_TYPE_TX_INSPECT_IND = 0xd, HTT_TLV_T2H_MSG_TYPE_MGMT_TX_COMPL_IND = 0xe, HTT_TLV_T2H_MSG_TYPE_TX_CREDIT_UPDATE_IND = 0xf, HTT_TLV_T2H_MSG_TYPE_RX_PN_IND = 0x10, HTT_TLV_T2H_MSG_TYPE_RX_OFFLOAD_DELIVER_IND = 0x11, HTT_TLV_T2H_MSG_TYPE_RX_IN_ORD_PADDR_IND = 0x12, /* 0x13 reservd */ HTT_TLV_T2H_MSG_TYPE_WDI_IPA_OP_RESPONSE = 0x14, HTT_TLV_T2H_MSG_TYPE_CHAN_CHANGE = 0x15, HTT_TLV_T2H_MSG_TYPE_RX_OFLD_PKT_ERR = 0x16, HTT_TLV_T2H_MSG_TYPE_TEST, /* keep this last */ HTT_TLV_T2H_NUM_MSGS }; enum htt_10_4_t2h_msg_type { HTT_10_4_T2H_MSG_TYPE_VERSION_CONF = 0x0, HTT_10_4_T2H_MSG_TYPE_RX_IND = 0x1, HTT_10_4_T2H_MSG_TYPE_RX_FLUSH = 0x2, HTT_10_4_T2H_MSG_TYPE_PEER_MAP = 0x3, HTT_10_4_T2H_MSG_TYPE_PEER_UNMAP = 0x4, HTT_10_4_T2H_MSG_TYPE_RX_ADDBA = 0x5, HTT_10_4_T2H_MSG_TYPE_RX_DELBA = 0x6, HTT_10_4_T2H_MSG_TYPE_TX_COMPL_IND = 0x7, HTT_10_4_T2H_MSG_TYPE_PKTLOG = 0x8, HTT_10_4_T2H_MSG_TYPE_STATS_CONF = 0x9, HTT_10_4_T2H_MSG_TYPE_RX_FRAG_IND = 0xa, HTT_10_4_T2H_MSG_TYPE_SEC_IND = 0xb, HTT_10_4_T2H_MSG_TYPE_RC_UPDATE_IND = 0xc, HTT_10_4_T2H_MSG_TYPE_TX_INSPECT_IND = 0xd, HTT_10_4_T2H_MSG_TYPE_MGMT_TX_COMPL_IND = 0xe, HTT_10_4_T2H_MSG_TYPE_CHAN_CHANGE = 0xf, HTT_10_4_T2H_MSG_TYPE_TX_CREDIT_UPDATE_IND = 0x10, HTT_10_4_T2H_MSG_TYPE_RX_PN_IND = 0x11, HTT_10_4_T2H_MSG_TYPE_RX_OFFLOAD_DELIVER_IND = 0x12, HTT_10_4_T2H_MSG_TYPE_TEST = 0x13, HTT_10_4_T2H_MSG_TYPE_EN_STATS = 0x14, HTT_10_4_T2H_MSG_TYPE_AGGR_CONF = 0x15, HTT_10_4_T2H_MSG_TYPE_TX_FETCH_IND = 0x16, HTT_10_4_T2H_MSG_TYPE_TX_FETCH_CONFIRM = 0x17, HTT_10_4_T2H_MSG_TYPE_STATS_NOUPLOAD = 0x18, /* 0x19 to 0x2f are reserved */ HTT_10_4_T2H_MSG_TYPE_TX_MODE_SWITCH_IND = 0x30, HTT_10_4_T2H_MSG_TYPE_PEER_STATS = 0x31, /* keep this last */ HTT_10_4_T2H_NUM_MSGS }; enum htt_t2h_msg_type { HTT_T2H_MSG_TYPE_VERSION_CONF, HTT_T2H_MSG_TYPE_RX_IND, HTT_T2H_MSG_TYPE_RX_FLUSH, HTT_T2H_MSG_TYPE_PEER_MAP, HTT_T2H_MSG_TYPE_PEER_UNMAP, HTT_T2H_MSG_TYPE_RX_ADDBA, HTT_T2H_MSG_TYPE_RX_DELBA, HTT_T2H_MSG_TYPE_TX_COMPL_IND, HTT_T2H_MSG_TYPE_PKTLOG, HTT_T2H_MSG_TYPE_STATS_CONF, HTT_T2H_MSG_TYPE_RX_FRAG_IND, HTT_T2H_MSG_TYPE_SEC_IND, HTT_T2H_MSG_TYPE_RC_UPDATE_IND, HTT_T2H_MSG_TYPE_TX_INSPECT_IND, HTT_T2H_MSG_TYPE_MGMT_TX_COMPLETION, HTT_T2H_MSG_TYPE_TX_CREDIT_UPDATE_IND, HTT_T2H_MSG_TYPE_RX_PN_IND, HTT_T2H_MSG_TYPE_RX_OFFLOAD_DELIVER_IND, HTT_T2H_MSG_TYPE_RX_IN_ORD_PADDR_IND, HTT_T2H_MSG_TYPE_WDI_IPA_OP_RESPONSE, HTT_T2H_MSG_TYPE_CHAN_CHANGE, HTT_T2H_MSG_TYPE_RX_OFLD_PKT_ERR, HTT_T2H_MSG_TYPE_AGGR_CONF, HTT_T2H_MSG_TYPE_STATS_NOUPLOAD, HTT_T2H_MSG_TYPE_TEST, HTT_T2H_MSG_TYPE_EN_STATS, HTT_T2H_MSG_TYPE_TX_FETCH_IND, HTT_T2H_MSG_TYPE_TX_FETCH_CONFIRM, HTT_T2H_MSG_TYPE_TX_MODE_SWITCH_IND, 
HTT_T2H_MSG_TYPE_PEER_STATS, /* keep this last */ HTT_T2H_NUM_MSGS }; /* * htt_resp_hdr - header for target-to-host messages * * msg_type: see htt_t2h_msg_type */ struct htt_resp_hdr { u8 msg_type; } __packed; #define HTT_RESP_HDR_MSG_TYPE_OFFSET 0 #define HTT_RESP_HDR_MSG_TYPE_MASK 0xff #define HTT_RESP_HDR_MSG_TYPE_LSB 0 /* htt_ver_resp - response sent for htt_ver_req */ struct htt_ver_resp { u8 minor; u8 major; u8 rsvd0; } __packed; #define HTT_MGMT_TX_CMPL_FLAG_ACK_RSSI BIT(0) #define HTT_MGMT_TX_CMPL_INFO_ACK_RSSI_MASK GENMASK(7, 0) struct htt_mgmt_tx_completion { u8 rsvd0; u8 rsvd1; u8 flags; __le32 desc_id; __le32 status; __le32 ppdu_id; __le32 info; } __packed; #define HTT_RX_INDICATION_INFO0_EXT_TID_MASK (0x1F) #define HTT_RX_INDICATION_INFO0_EXT_TID_LSB (0) #define HTT_RX_INDICATION_INFO0_FLUSH_VALID (1 << 5) #define HTT_RX_INDICATION_INFO0_RELEASE_VALID (1 << 6) #define HTT_RX_INDICATION_INFO0_PPDU_DURATION BIT(7) #define HTT_RX_INDICATION_INFO1_FLUSH_START_SEQNO_MASK 0x0000003F #define HTT_RX_INDICATION_INFO1_FLUSH_START_SEQNO_LSB 0 #define HTT_RX_INDICATION_INFO1_FLUSH_END_SEQNO_MASK 0x00000FC0 #define HTT_RX_INDICATION_INFO1_FLUSH_END_SEQNO_LSB 6 #define HTT_RX_INDICATION_INFO1_RELEASE_START_SEQNO_MASK 0x0003F000 #define HTT_RX_INDICATION_INFO1_RELEASE_START_SEQNO_LSB 12 #define HTT_RX_INDICATION_INFO1_RELEASE_END_SEQNO_MASK 0x00FC0000 #define HTT_RX_INDICATION_INFO1_RELEASE_END_SEQNO_LSB 18 #define HTT_RX_INDICATION_INFO1_NUM_MPDU_RANGES_MASK 0xFF000000 #define HTT_RX_INDICATION_INFO1_NUM_MPDU_RANGES_LSB 24 #define HTT_TX_CMPL_FLAG_DATA_RSSI BIT(0) #define HTT_TX_CMPL_FLAG_PPID_PRESENT BIT(1) #define HTT_TX_CMPL_FLAG_PA_PRESENT BIT(2) #define HTT_TX_CMPL_FLAG_PPDU_DURATION_PRESENT BIT(3) #define HTT_TX_DATA_RSSI_ENABLE_WCN3990 BIT(3) #define HTT_TX_DATA_APPEND_RETRIES BIT(0) #define HTT_TX_DATA_APPEND_TIMESTAMP BIT(1) struct htt_rx_indication_hdr { u8 info0; /* %HTT_RX_INDICATION_INFO0_ */ __le16 peer_id; __le32 info1; /* %HTT_RX_INDICATION_INFO1_ */ } __packed; #define HTT_RX_INDICATION_INFO0_PHY_ERR_VALID (1 << 0) #define HTT_RX_INDICATION_INFO0_LEGACY_RATE_MASK (0x1E) #define HTT_RX_INDICATION_INFO0_LEGACY_RATE_LSB (1) #define HTT_RX_INDICATION_INFO0_LEGACY_RATE_CCK (1 << 5) #define HTT_RX_INDICATION_INFO0_END_VALID (1 << 6) #define HTT_RX_INDICATION_INFO0_START_VALID (1 << 7) #define HTT_RX_INDICATION_INFO1_VHT_SIG_A1_MASK 0x00FFFFFF #define HTT_RX_INDICATION_INFO1_VHT_SIG_A1_LSB 0 #define HTT_RX_INDICATION_INFO1_PREAMBLE_TYPE_MASK 0xFF000000 #define HTT_RX_INDICATION_INFO1_PREAMBLE_TYPE_LSB 24 #define HTT_RX_INDICATION_INFO2_VHT_SIG_A1_MASK 0x00FFFFFF #define HTT_RX_INDICATION_INFO2_VHT_SIG_A1_LSB 0 #define HTT_RX_INDICATION_INFO2_SERVICE_MASK 0xFF000000 #define HTT_RX_INDICATION_INFO2_SERVICE_LSB 24 enum htt_rx_legacy_rate { HTT_RX_OFDM_48 = 0, HTT_RX_OFDM_24 = 1, HTT_RX_OFDM_12, HTT_RX_OFDM_6, HTT_RX_OFDM_54, HTT_RX_OFDM_36, HTT_RX_OFDM_18, HTT_RX_OFDM_9, /* long preamble */ HTT_RX_CCK_11_LP = 0, HTT_RX_CCK_5_5_LP = 1, HTT_RX_CCK_2_LP, HTT_RX_CCK_1_LP, /* short preamble */ HTT_RX_CCK_11_SP, HTT_RX_CCK_5_5_SP, HTT_RX_CCK_2_SP }; enum htt_rx_legacy_rate_type { HTT_RX_LEGACY_RATE_OFDM = 0, HTT_RX_LEGACY_RATE_CCK }; enum htt_rx_preamble_type { HTT_RX_LEGACY = 0x4, HTT_RX_HT = 0x8, HTT_RX_HT_WITH_TXBF = 0x9, HTT_RX_VHT = 0xC, HTT_RX_VHT_WITH_TXBF = 0xD, }; /* * Fields: phy_err_valid, phy_err_code, tsf, * usec_timestamp, sub_usec_timestamp * ..are valid only if end_valid == 1. 
* * Fields: rssi_chains, legacy_rate_type, * legacy_rate_cck, preamble_type, service, * vht_sig_* * ..are valid only if start_valid == 1; */ struct htt_rx_indication_ppdu { u8 combined_rssi; u8 sub_usec_timestamp; u8 phy_err_code; u8 info0; /* HTT_RX_INDICATION_INFO0_ */ struct { u8 pri20_db; u8 ext20_db; u8 ext40_db; u8 ext80_db; } __packed rssi_chains[4]; __le32 tsf; __le32 usec_timestamp; __le32 info1; /* HTT_RX_INDICATION_INFO1_ */ __le32 info2; /* HTT_RX_INDICATION_INFO2_ */ } __packed; enum htt_rx_mpdu_status { HTT_RX_IND_MPDU_STATUS_UNKNOWN = 0x0, HTT_RX_IND_MPDU_STATUS_OK, HTT_RX_IND_MPDU_STATUS_ERR_FCS, HTT_RX_IND_MPDU_STATUS_ERR_DUP, HTT_RX_IND_MPDU_STATUS_ERR_REPLAY, HTT_RX_IND_MPDU_STATUS_ERR_INV_PEER, /* only accept EAPOL frames */ HTT_RX_IND_MPDU_STATUS_UNAUTH_PEER, HTT_RX_IND_MPDU_STATUS_OUT_OF_SYNC, /* Non-data in promiscuous mode */ HTT_RX_IND_MPDU_STATUS_MGMT_CTRL, HTT_RX_IND_MPDU_STATUS_TKIP_MIC_ERR, HTT_RX_IND_MPDU_STATUS_DECRYPT_ERR, HTT_RX_IND_MPDU_STATUS_MPDU_LENGTH_ERR, HTT_RX_IND_MPDU_STATUS_ENCRYPT_REQUIRED_ERR, HTT_RX_IND_MPDU_STATUS_PRIVACY_ERR, /* * MISC: discard for unspecified reasons. * Leave this enum value last. */ HTT_RX_IND_MPDU_STATUS_ERR_MISC = 0xFF }; struct htt_rx_indication_mpdu_range { u8 mpdu_count; u8 mpdu_range_status; /* %htt_rx_mpdu_status */ u8 pad0; u8 pad1; } __packed; struct htt_rx_indication_prefix { __le16 fw_rx_desc_bytes; u8 pad0; u8 pad1; } __packed; struct htt_rx_indication { struct htt_rx_indication_hdr hdr; struct htt_rx_indication_ppdu ppdu; struct htt_rx_indication_prefix prefix; /* * the following fields are both dynamically sized, so * take care addressing them */ /* the size of this is %fw_rx_desc_bytes */ struct fw_rx_desc_base fw_desc; /* * %mpdu_ranges starts after &%prefix + roundup(%fw_rx_desc_bytes, 4) * and has %num_mpdu_ranges elements. */ struct htt_rx_indication_mpdu_range mpdu_ranges[]; } __packed; /* High latency version of the RX indication */ struct htt_rx_indication_hl { struct htt_rx_indication_hdr hdr; struct htt_rx_indication_ppdu ppdu; struct htt_rx_indication_prefix prefix; struct fw_rx_desc_hl fw_desc; struct htt_rx_indication_mpdu_range mpdu_ranges[]; } __packed; struct htt_hl_rx_desc { __le32 info; __le32 pn_31_0; union { struct { __le16 pn_47_32; __le16 pn_63_48; } pn16; __le32 pn_63_32; } u0; __le32 pn_95_64; __le32 pn_127_96; } __packed; static inline struct htt_rx_indication_mpdu_range * htt_rx_ind_get_mpdu_ranges(struct htt_rx_indication *rx_ind) { void *ptr = rx_ind; ptr += sizeof(rx_ind->hdr) + sizeof(rx_ind->ppdu) + sizeof(rx_ind->prefix) + roundup(__le16_to_cpu(rx_ind->prefix.fw_rx_desc_bytes), 4); return ptr; } static inline struct htt_rx_indication_mpdu_range * htt_rx_ind_get_mpdu_ranges_hl(struct htt_rx_indication_hl *rx_ind) { void *ptr = rx_ind; ptr += sizeof(rx_ind->hdr) + sizeof(rx_ind->ppdu) + sizeof(rx_ind->prefix) + sizeof(rx_ind->fw_desc); return ptr; } enum htt_rx_flush_mpdu_status { HTT_RX_FLUSH_MPDU_DISCARD = 0, HTT_RX_FLUSH_MPDU_REORDER = 1, }; /* * htt_rx_flush - discard or reorder given range of mpdus * * Note: host must check if all sequence numbers between * [seq_num_start, seq_num_end-1] are valid. 
*/ struct htt_rx_flush { __le16 peer_id; u8 tid; u8 rsvd0; u8 mpdu_status; /* %htt_rx_flush_mpdu_status */ u8 seq_num_start; /* it is 6 LSBs of 802.11 seq no */ u8 seq_num_end; /* it is 6 LSBs of 802.11 seq no */ }; struct htt_rx_peer_map { u8 vdev_id; __le16 peer_id; u8 addr[6]; u8 rsvd0; u8 rsvd1; } __packed; struct htt_rx_peer_unmap { u8 rsvd0; __le16 peer_id; } __packed; enum htt_txrx_sec_cast_type { HTT_TXRX_SEC_MCAST = 0, HTT_TXRX_SEC_UCAST }; enum htt_rx_pn_check_type { HTT_RX_NON_PN_CHECK = 0, HTT_RX_PN_CHECK }; enum htt_rx_tkip_demic_type { HTT_RX_NON_TKIP_MIC = 0, HTT_RX_TKIP_MIC }; enum htt_security_types { HTT_SECURITY_NONE, HTT_SECURITY_WEP128, HTT_SECURITY_WEP104, HTT_SECURITY_WEP40, HTT_SECURITY_TKIP, HTT_SECURITY_TKIP_NOMIC, HTT_SECURITY_AES_CCMP, HTT_SECURITY_WAPI, HTT_NUM_SECURITY_TYPES /* keep this last! */ }; #define ATH10K_HTT_TXRX_PEER_SECURITY_MAX 2 #define ATH10K_TXRX_NUM_EXT_TIDS 19 #define ATH10K_TXRX_NON_QOS_TID 16 enum htt_security_flags { #define HTT_SECURITY_TYPE_MASK 0x7F #define HTT_SECURITY_TYPE_LSB 0 HTT_SECURITY_IS_UNICAST = 1 << 7 }; struct htt_security_indication { union { /* dont use bitfields; undefined behaviour */ u8 flags; /* %htt_security_flags */ struct { u8 security_type:7, /* %htt_security_types */ is_unicast:1; } __packed; } __packed; __le16 peer_id; u8 michael_key[8]; u8 wapi_rsc[16]; } __packed; #define HTT_RX_BA_INFO0_TID_MASK 0x000F #define HTT_RX_BA_INFO0_TID_LSB 0 #define HTT_RX_BA_INFO0_PEER_ID_MASK 0xFFF0 #define HTT_RX_BA_INFO0_PEER_ID_LSB 4 struct htt_rx_addba { u8 window_size; __le16 info0; /* %HTT_RX_BA_INFO0_ */ } __packed; struct htt_rx_delba { u8 rsvd0; __le16 info0; /* %HTT_RX_BA_INFO0_ */ } __packed; enum htt_data_tx_status { HTT_DATA_TX_STATUS_OK = 0, HTT_DATA_TX_STATUS_DISCARD = 1, HTT_DATA_TX_STATUS_NO_ACK = 2, HTT_DATA_TX_STATUS_POSTPONE = 3 /* HL only */ }; enum htt_data_tx_flags { #define HTT_DATA_TX_STATUS_MASK 0x07 #define HTT_DATA_TX_STATUS_LSB 0 #define HTT_DATA_TX_TID_MASK 0x78 #define HTT_DATA_TX_TID_LSB 3 HTT_DATA_TX_TID_INVALID = 1 << 7 }; #define HTT_TX_COMPL_INV_MSDU_ID 0xFFFF struct htt_append_retries { __le16 msdu_id; u8 tx_retries; u8 flag; } __packed; struct htt_data_tx_completion_ext { struct htt_append_retries a_retries; __le32 t_stamp; __le16 msdus_rssi[]; } __packed; /* * @brief target -> host TX completion indication message definition * * @details * The following diagram shows the format of the TX completion indication sent * from the target to the host * * |31 28|27|26|25|24|23 16| 15 |14 11|10 8|7 0| * |-------------------------------------------------------------| * header: |rsvd |A2|TP|A1|A0| num | t_i| tid |status| msg_type | * |-------------------------------------------------------------| * payload: | MSDU1 ID | MSDU0 ID | * |-------------------------------------------------------------| * : MSDU3 ID : MSDU2 ID : * |-------------------------------------------------------------| * | struct htt_tx_compl_ind_append_retries | * |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| * | struct htt_tx_compl_ind_append_tx_tstamp | * |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| * | MSDU1 ACK RSSI | MSDU0 ACK RSSI | * |-------------------------------------------------------------| * : MSDU3 ACK RSSI : MSDU2 ACK RSSI : * |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| * -msg_type * Bits 7:0 * Purpose: identifies this as HTT TX completion indication * -status * Bits 10:8 * Purpose: the TX completion status of payload fragmentations descriptors * Value: could be 
HTT_TX_COMPL_IND_STAT_OK or HTT_TX_COMPL_IND_STAT_DISCARD * -tid * Bits 14:11 * Purpose: the tid associated with those fragmentation descriptors. It is * valid or not, depending on the tid_invalid bit. * Value: 0 to 15 * -tid_invalid * Bits 15:15 * Purpose: this bit indicates whether the tid field is valid or not * Value: 0 indicates valid, 1 indicates invalid * -num * Bits 23:16 * Purpose: the number of payload in this indication * Value: 1 to 255 * -A0 = append * Bits 24:24 * Purpose: append the struct htt_tx_compl_ind_append_retries which contains * the number of tx retries for one MSDU at the end of this message * Value: 0 indicates no appending, 1 indicates appending * -A1 = append1 * Bits 25:25 * Purpose: Append the struct htt_tx_compl_ind_append_tx_tstamp which * contains the timestamp info for each TX msdu id in payload. * Value: 0 indicates no appending, 1 indicates appending * -TP = MSDU tx power presence * Bits 26:26 * Purpose: Indicate whether the TX_COMPL_IND includes a tx power report * for each MSDU referenced by the TX_COMPL_IND message. * The order of the per-MSDU tx power reports matches the order * of the MSDU IDs. * Value: 0 indicates not appending, 1 indicates appending * -A2 = append2 * Bits 27:27 * Purpose: Indicate whether data ACK RSSI is appended for each MSDU in * TX_COMP_IND message. The order of the per-MSDU ACK RSSI report * matches the order of the MSDU IDs. * The ACK RSSI values are valid when status is COMPLETE_OK (and * this append2 bit is set). * Value: 0 indicates not appending, 1 indicates appending */ struct htt_data_tx_completion { union { u8 flags; struct { u8 status:3, tid:4, tid_invalid:1; } __packed; } __packed; u8 num_msdus; u8 flags2; /* HTT_TX_CMPL_FLAG_DATA_RSSI */ __le16 msdus[]; /* variable length based on %num_msdus */ } __packed; #define HTT_TX_PPDU_DUR_INFO0_PEER_ID_MASK GENMASK(15, 0) #define HTT_TX_PPDU_DUR_INFO0_TID_MASK GENMASK(20, 16) struct htt_data_tx_ppdu_dur { __le32 info0; /* HTT_TX_PPDU_DUR_INFO0_ */ __le32 tx_duration; /* in usecs */ } __packed; #define HTT_TX_COMPL_PPDU_DUR_INFO0_NUM_ENTRIES_MASK GENMASK(7, 0) struct htt_data_tx_compl_ppdu_dur { __le32 info0; /* HTT_TX_COMPL_PPDU_DUR_INFO0_ */ struct htt_data_tx_ppdu_dur ppdu_dur[]; } __packed; struct htt_tx_compl_ind_base { u32 hdr; u16 payload[1/*or more*/]; } __packed; struct htt_rc_tx_done_params { u32 rate_code; u32 rate_code_flags; u32 flags; u32 num_enqued; /* 1 for non-AMPDU */ u32 num_retries; u32 num_failed; /* for AMPDU */ u32 ack_rssi; u32 time_stamp; u32 is_probe; }; struct htt_rc_update { u8 vdev_id; __le16 peer_id; u8 addr[6]; u8 num_elems; u8 rsvd0; struct htt_rc_tx_done_params params[]; /* variable length %num_elems */ } __packed; /* see htt_rx_indication for similar fields and descriptions */ struct htt_rx_fragment_indication { union { u8 info0; /* %HTT_RX_FRAG_IND_INFO0_ */ struct { u8 ext_tid:5, flush_valid:1; } __packed; } __packed; __le16 peer_id; __le32 info1; /* %HTT_RX_FRAG_IND_INFO1_ */ __le16 fw_rx_desc_bytes; __le16 rsvd0; u8 fw_msdu_rx_desc[]; } __packed; #define ATH10K_IEEE80211_EXTIV BIT(5) #define ATH10K_IEEE80211_TKIP_MICLEN 8 /* trailing MIC */ #define HTT_RX_FRAG_IND_INFO0_HEADER_LEN 16 #define HTT_RX_FRAG_IND_INFO0_EXT_TID_MASK 0x1F #define HTT_RX_FRAG_IND_INFO0_EXT_TID_LSB 0 #define HTT_RX_FRAG_IND_INFO0_FLUSH_VALID_MASK 0x20 #define HTT_RX_FRAG_IND_INFO0_FLUSH_VALID_LSB 5 #define HTT_RX_FRAG_IND_INFO1_FLUSH_SEQ_NUM_START_MASK 0x0000003F #define HTT_RX_FRAG_IND_INFO1_FLUSH_SEQ_NUM_START_LSB 0 #define 
HTT_RX_FRAG_IND_INFO1_FLUSH_SEQ_NUM_END_MASK 0x00000FC0 #define HTT_RX_FRAG_IND_INFO1_FLUSH_SEQ_NUM_END_LSB 6 struct htt_rx_pn_ind { __le16 peer_id; u8 tid; u8 seqno_start; u8 seqno_end; u8 pn_ie_count; u8 reserved; u8 pn_ies[]; } __packed; struct htt_rx_offload_msdu { __le16 msdu_len; __le16 peer_id; u8 vdev_id; u8 tid; u8 fw_desc; u8 payload[]; } __packed; struct htt_rx_offload_ind { u8 reserved; __le16 msdu_count; } __packed; struct htt_rx_in_ord_msdu_desc { __le32 msdu_paddr; __le16 msdu_len; u8 fw_desc; u8 reserved; } __packed; struct htt_rx_in_ord_msdu_desc_ext { __le64 msdu_paddr; __le16 msdu_len; u8 fw_desc; u8 reserved; } __packed; struct htt_rx_in_ord_ind { u8 info; __le16 peer_id; u8 vdev_id; u8 reserved; __le16 msdu_count; union { DECLARE_FLEX_ARRAY(struct htt_rx_in_ord_msdu_desc, msdu_descs32); DECLARE_FLEX_ARRAY(struct htt_rx_in_ord_msdu_desc_ext, msdu_descs64); } __packed; } __packed; #define HTT_RX_IN_ORD_IND_INFO_TID_MASK 0x0000001f #define HTT_RX_IN_ORD_IND_INFO_TID_LSB 0 #define HTT_RX_IN_ORD_IND_INFO_OFFLOAD_MASK 0x00000020 #define HTT_RX_IN_ORD_IND_INFO_OFFLOAD_LSB 5 #define HTT_RX_IN_ORD_IND_INFO_FRAG_MASK 0x00000040 #define HTT_RX_IN_ORD_IND_INFO_FRAG_LSB 6 /* * target -> host test message definition * * The following field definitions describe the format of the test * message sent from the target to the host. * The message consists of a 4-octet header, followed by a variable * number of 32-bit integer values, followed by a variable number * of 8-bit character values. * * |31 16|15 8|7 0| * |-----------------------------------------------------------| * | num chars | num ints | msg type | * |-----------------------------------------------------------| * | int 0 | * |-----------------------------------------------------------| * | int 1 | * |-----------------------------------------------------------| * | ... | * |-----------------------------------------------------------| * | char 3 | char 2 | char 1 | char 0 | * |-----------------------------------------------------------| * | | | ... | char 4 | * |-----------------------------------------------------------| * - MSG_TYPE * Bits 7:0 * Purpose: identifies this as a test message * Value: HTT_MSG_TYPE_TEST * - NUM_INTS * Bits 15:8 * Purpose: indicate how many 32-bit integers follow the message header * - NUM_CHARS * Bits 31:16 * Purpose: indicate how many 8-bit characters follow the series of integers */ struct htt_rx_test { u8 num_ints; __le16 num_chars; /* payload consists of 2 lists: * a) num_ints * sizeof(__le32) * b) num_chars * sizeof(u8) aligned to 4bytes */ u8 payload[]; } __packed; static inline __le32 *htt_rx_test_get_ints(struct htt_rx_test *rx_test) { return (__le32 *)rx_test->payload; } static inline u8 *htt_rx_test_get_chars(struct htt_rx_test *rx_test) { return rx_test->payload + (rx_test->num_ints * sizeof(__le32)); } /* * target -> host packet log message * * The following field definitions describe the format of the packet log * message sent from the target to the host. * The message consists of a 4-octet header,followed by a variable number * of 32-bit character values. 
* * |31 24|23 16|15 8|7 0| * |-----------------------------------------------------------| * | | | | msg type | * |-----------------------------------------------------------| * | payload | * |-----------------------------------------------------------| * - MSG_TYPE * Bits 7:0 * Purpose: identifies this as a test message * Value: HTT_MSG_TYPE_PACKETLOG */ struct htt_pktlog_msg { u8 pad[3]; u8 payload[]; } __packed; struct htt_dbg_stats_rx_reorder_stats { /* Non QoS MPDUs received */ __le32 deliver_non_qos; /* MPDUs received in-order */ __le32 deliver_in_order; /* Flush due to reorder timer expired */ __le32 deliver_flush_timeout; /* Flush due to move out of window */ __le32 deliver_flush_oow; /* Flush due to DELBA */ __le32 deliver_flush_delba; /* MPDUs dropped due to FCS error */ __le32 fcs_error; /* MPDUs dropped due to monitor mode non-data packet */ __le32 mgmt_ctrl; /* MPDUs dropped due to invalid peer */ __le32 invalid_peer; /* MPDUs dropped due to duplication (non aggregation) */ __le32 dup_non_aggr; /* MPDUs dropped due to processed before */ __le32 dup_past; /* MPDUs dropped due to duplicate in reorder queue */ __le32 dup_in_reorder; /* Reorder timeout happened */ __le32 reorder_timeout; /* invalid bar ssn */ __le32 invalid_bar_ssn; /* reorder reset due to bar ssn */ __le32 ssn_reset; }; struct htt_dbg_stats_wal_tx_stats { /* Num HTT cookies queued to dispatch list */ __le32 comp_queued; /* Num HTT cookies dispatched */ __le32 comp_delivered; /* Num MSDU queued to WAL */ __le32 msdu_enqued; /* Num MPDU queue to WAL */ __le32 mpdu_enqued; /* Num MSDUs dropped by WMM limit */ __le32 wmm_drop; /* Num Local frames queued */ __le32 local_enqued; /* Num Local frames done */ __le32 local_freed; /* Num queued to HW */ __le32 hw_queued; /* Num PPDU reaped from HW */ __le32 hw_reaped; /* Num underruns */ __le32 underrun; /* Num PPDUs cleaned up in TX abort */ __le32 tx_abort; /* Num MPDUs requeued by SW */ __le32 mpdus_requeued; /* excessive retries */ __le32 tx_ko; /* data hw rate code */ __le32 data_rc; /* Scheduler self triggers */ __le32 self_triggers; /* frames dropped due to excessive sw retries */ __le32 sw_retry_failure; /* illegal rate phy errors */ __le32 illgl_rate_phy_err; /* wal pdev continuous xretry */ __le32 pdev_cont_xretry; /* wal pdev continuous xretry */ __le32 pdev_tx_timeout; /* wal pdev resets */ __le32 pdev_resets; __le32 phy_underrun; /* MPDU is more than txop limit */ __le32 txop_ovf; } __packed; struct htt_dbg_stats_wal_rx_stats { /* Cnts any change in ring routing mid-ppdu */ __le32 mid_ppdu_route_change; /* Total number of statuses processed */ __le32 status_rcvd; /* Extra frags on rings 0-3 */ __le32 r0_frags; __le32 r1_frags; __le32 r2_frags; __le32 r3_frags; /* MSDUs / MPDUs delivered to HTT */ __le32 htt_msdus; __le32 htt_mpdus; /* MSDUs / MPDUs delivered to local stack */ __le32 loc_msdus; __le32 loc_mpdus; /* AMSDUs that have more MSDUs than the status ring size */ __le32 oversize_amsdu; /* Number of PHY errors */ __le32 phy_errs; /* Number of PHY errors drops */ __le32 phy_err_drop; /* Number of mpdu errors - FCS, MIC, ENC etc. 
*/ __le32 mpdu_errs; } __packed; struct htt_dbg_stats_wal_peer_stats { __le32 dummy; /* REMOVE THIS ONCE REAL PEER STAT COUNTERS ARE ADDED */ } __packed; struct htt_dbg_stats_wal_pdev_txrx { struct htt_dbg_stats_wal_tx_stats tx_stats; struct htt_dbg_stats_wal_rx_stats rx_stats; struct htt_dbg_stats_wal_peer_stats peer_stats; } __packed; struct htt_dbg_stats_rx_rate_info { __le32 mcs[10]; __le32 sgi[10]; __le32 nss[4]; __le32 stbc[10]; __le32 bw[3]; __le32 pream[6]; __le32 ldpc; __le32 txbf; }; /* * htt_dbg_stats_status - * present - The requested stats have been delivered in full. * This indicates that either the stats information was contained * in its entirety within this message, or else this message * completes the delivery of the requested stats info that was * partially delivered through earlier STATS_CONF messages. * partial - The requested stats have been delivered in part. * One or more subsequent STATS_CONF messages with the same * cookie value will be sent to deliver the remainder of the * information. * error - The requested stats could not be delivered, for example due * to a shortage of memory to construct a message holding the * requested stats. * invalid - The requested stat type is either not recognized, or the * target is configured to not gather the stats type in question. * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * series_done - This special value indicates that no further stats info * elements are present within a series of stats info elems * (within a stats upload confirmation message). */ enum htt_dbg_stats_status { HTT_DBG_STATS_STATUS_PRESENT = 0, HTT_DBG_STATS_STATUS_PARTIAL = 1, HTT_DBG_STATS_STATUS_ERROR = 2, HTT_DBG_STATS_STATUS_INVALID = 3, HTT_DBG_STATS_STATUS_SERIES_DONE = 7 }; /* * host -> target FRAG DESCRIPTOR/MSDU_EXT DESC bank * * The following field definitions describe the format of the HTT host * to target frag_desc/msdu_ext bank configuration message. * The message contains the based address and the min and max id of the * MSDU_EXT/FRAG_DESC that will be used by the HTT to map MSDU DESC and * MSDU_EXT/FRAG_DESC. * HTT will use id in HTT descriptor instead sending the frag_desc_ptr. * For QCA988X HW the firmware will use fragment_desc_ptr but in WIFI2.0 * the hardware does the mapping/translation. * * Total banks that can be configured is configured to 16. * * This should be called before any TX has be initiated by the HTT * * |31 16|15 8|7 5|4 0| * |------------------------------------------------------------| * | DESC_SIZE | NUM_BANKS | RES |SWP|pdev| msg type | * |------------------------------------------------------------| * | BANK0_BASE_ADDRESS | * |------------------------------------------------------------| * | ... | * |------------------------------------------------------------| * | BANK15_BASE_ADDRESS | * |------------------------------------------------------------| * | BANK0_MAX_ID | BANK0_MIN_ID | * |------------------------------------------------------------| * | ... | * |------------------------------------------------------------| * | BANK15_MAX_ID | BANK15_MIN_ID | * |------------------------------------------------------------| * Header fields: * - MSG_TYPE * Bits 7:0 * Value: 0x6 * - BANKx_BASE_ADDRESS * Bits 31:0 * Purpose: Provide a mechanism to specify the base address of the MSDU_EXT * bank physical/bus address. * - BANKx_MIN_ID * Bits 15:0 * Purpose: Provide a mechanism to specify the min index that needs to * mapped. 
* - BANKx_MAX_ID * Bits 31:16 * Purpose: Provide a mechanism to specify the max index that needs to * */ struct htt_frag_desc_bank_id { __le16 bank_min_id; __le16 bank_max_id; } __packed; /* real is 16 but it wouldn't fit in the max htt message size * so we use a conservatively safe value for now */ #define HTT_FRAG_DESC_BANK_MAX 4 #define HTT_FRAG_DESC_BANK_CFG_INFO_PDEV_ID_MASK 0x03 #define HTT_FRAG_DESC_BANK_CFG_INFO_PDEV_ID_LSB 0 #define HTT_FRAG_DESC_BANK_CFG_INFO_SWAP BIT(2) #define HTT_FRAG_DESC_BANK_CFG_INFO_Q_STATE_VALID BIT(3) #define HTT_FRAG_DESC_BANK_CFG_INFO_Q_STATE_DEPTH_TYPE_MASK BIT(4) #define HTT_FRAG_DESC_BANK_CFG_INFO_Q_STATE_DEPTH_TYPE_LSB 4 enum htt_q_depth_type { HTT_Q_DEPTH_TYPE_BYTES = 0, HTT_Q_DEPTH_TYPE_MSDUS = 1, }; #define HTT_TX_Q_STATE_NUM_PEERS (TARGET_10_4_NUM_QCACHE_PEERS_MAX + \ TARGET_10_4_NUM_VDEVS) #define HTT_TX_Q_STATE_NUM_TIDS 8 #define HTT_TX_Q_STATE_ENTRY_SIZE 1 #define HTT_TX_Q_STATE_ENTRY_MULTIPLIER 0 /** * struct htt_q_state_conf - part of htt_frag_desc_bank_cfg for host q state config * * Defines host q state format and behavior. See htt_q_state. * * @paddr: Queue physical address * @num_peers: Number of supported peers * @num_tids: Number of supported TIDs * @record_size: Defines the size of each host q entry in bytes. In practice * however firmware (at least 10.4.3-00191) ignores this host * configuration value and uses hardcoded value of 1. * @record_multiplier: This is valid only when q depth type is MSDUs. It * defines the exponent for the power of 2 multiplication. * @pad: struct padding for 32-bit alignment */ struct htt_q_state_conf { __le32 paddr; __le16 num_peers; __le16 num_tids; u8 record_size; u8 record_multiplier; u8 pad[2]; } __packed; struct htt_frag_desc_bank_cfg32 { u8 info; /* HTT_FRAG_DESC_BANK_CFG_INFO_ */ u8 num_banks; u8 desc_size; __le32 bank_base_addrs[HTT_FRAG_DESC_BANK_MAX]; struct htt_frag_desc_bank_id bank_id[HTT_FRAG_DESC_BANK_MAX]; struct htt_q_state_conf q_state; } __packed; struct htt_frag_desc_bank_cfg64 { u8 info; /* HTT_FRAG_DESC_BANK_CFG_INFO_ */ u8 num_banks; u8 desc_size; __le64 bank_base_addrs[HTT_FRAG_DESC_BANK_MAX]; struct htt_frag_desc_bank_id bank_id[HTT_FRAG_DESC_BANK_MAX]; struct htt_q_state_conf q_state; } __packed; #define HTT_TX_Q_STATE_ENTRY_COEFFICIENT 128 #define HTT_TX_Q_STATE_ENTRY_FACTOR_MASK 0x3f #define HTT_TX_Q_STATE_ENTRY_FACTOR_LSB 0 #define HTT_TX_Q_STATE_ENTRY_EXP_MASK 0xc0 #define HTT_TX_Q_STATE_ENTRY_EXP_LSB 6 /** * struct htt_q_state - shared between host and firmware via DMA * * This structure is used for the host to expose it's software queue state to * firmware so that its rate control can schedule fetch requests for optimized * performance. This is most notably used for MU-MIMO aggregation when multiple * MU clients are connected. * * @count: Each element defines the host queue depth. When q depth type was * configured as HTT_Q_DEPTH_TYPE_BYTES then each entry is defined as: * FACTOR * 128 * 8^EXP (see HTT_TX_Q_STATE_ENTRY_FACTOR_MASK and * HTT_TX_Q_STATE_ENTRY_EXP_MASK). When q depth type was configured as * HTT_Q_DEPTH_TYPE_MSDUS the number of packets is scaled by 2 ** * record_multiplier (see htt_q_state_conf). * @map: Used by firmware to quickly check which host queues are not empty. It * is a bitmap simply saying. * @seq: Used by firmware to quickly check if the host queues were updated * since it last checked. * * FIXME: Is the q_state map[] size calculation really correct? 
*/ struct htt_q_state { u8 count[HTT_TX_Q_STATE_NUM_TIDS][HTT_TX_Q_STATE_NUM_PEERS]; u32 map[HTT_TX_Q_STATE_NUM_TIDS][(HTT_TX_Q_STATE_NUM_PEERS + 31) / 32]; __le32 seq; } __packed; #define HTT_TX_FETCH_RECORD_INFO_PEER_ID_MASK 0x0fff #define HTT_TX_FETCH_RECORD_INFO_PEER_ID_LSB 0 #define HTT_TX_FETCH_RECORD_INFO_TID_MASK 0xf000 #define HTT_TX_FETCH_RECORD_INFO_TID_LSB 12 struct htt_tx_fetch_record { __le16 info; /* HTT_TX_FETCH_IND_RECORD_INFO_ */ __le16 num_msdus; __le32 num_bytes; } __packed; struct htt_tx_fetch_ind { u8 pad0; __le16 fetch_seq_num; __le32 token; __le16 num_resp_ids; __le16 num_records; union { /* ath10k_htt_get_tx_fetch_ind_resp_ids() */ DECLARE_FLEX_ARRAY(__le32, resp_ids); DECLARE_FLEX_ARRAY(struct htt_tx_fetch_record, records); } __packed; } __packed; static inline void * ath10k_htt_get_tx_fetch_ind_resp_ids(struct htt_tx_fetch_ind *ind) { return (void *)&ind->records[le16_to_cpu(ind->num_records)]; } struct htt_tx_fetch_resp { u8 pad0; __le16 resp_id; __le16 fetch_seq_num; __le16 num_records; __le32 token; struct htt_tx_fetch_record records[]; } __packed; struct htt_tx_fetch_confirm { u8 pad0; __le16 num_resp_ids; __le32 resp_ids[]; } __packed; enum htt_tx_mode_switch_mode { HTT_TX_MODE_SWITCH_PUSH = 0, HTT_TX_MODE_SWITCH_PUSH_PULL = 1, }; #define HTT_TX_MODE_SWITCH_IND_INFO0_ENABLE BIT(0) #define HTT_TX_MODE_SWITCH_IND_INFO0_NUM_RECORDS_MASK 0xfffe #define HTT_TX_MODE_SWITCH_IND_INFO0_NUM_RECORDS_LSB 1 #define HTT_TX_MODE_SWITCH_IND_INFO1_MODE_MASK 0x0003 #define HTT_TX_MODE_SWITCH_IND_INFO1_MODE_LSB 0 #define HTT_TX_MODE_SWITCH_IND_INFO1_THRESHOLD_MASK 0xfffc #define HTT_TX_MODE_SWITCH_IND_INFO1_THRESHOLD_LSB 2 #define HTT_TX_MODE_SWITCH_RECORD_INFO0_PEER_ID_MASK 0x0fff #define HTT_TX_MODE_SWITCH_RECORD_INFO0_PEER_ID_LSB 0 #define HTT_TX_MODE_SWITCH_RECORD_INFO0_TID_MASK 0xf000 #define HTT_TX_MODE_SWITCH_RECORD_INFO0_TID_LSB 12 struct htt_tx_mode_switch_record { __le16 info0; /* HTT_TX_MODE_SWITCH_RECORD_INFO0_ */ __le16 num_max_msdus; } __packed; struct htt_tx_mode_switch_ind { u8 pad0; __le16 info0; /* HTT_TX_MODE_SWITCH_IND_INFO0_ */ __le16 info1; /* HTT_TX_MODE_SWITCH_IND_INFO1_ */ u8 pad1[2]; struct htt_tx_mode_switch_record records[]; } __packed; struct htt_channel_change { u8 pad[3]; __le32 freq; __le32 center_freq1; __le32 center_freq2; __le32 phymode; } __packed; struct htt_per_peer_tx_stats_ind { __le32 succ_bytes; __le32 retry_bytes; __le32 failed_bytes; u8 ratecode; u8 flags; __le16 peer_id; __le16 succ_pkts; __le16 retry_pkts; __le16 failed_pkts; __le16 tx_duration; __le32 reserved1; __le32 reserved2; } __packed; struct htt_peer_tx_stats { u8 num_ppdu; u8 ppdu_len; u8 version; u8 payload[]; } __packed; #define ATH10K_10_2_TX_STATS_OFFSET 136 #define PEER_STATS_FOR_NO_OF_PPDUS 4 struct ath10k_10_2_peer_tx_stats { u8 ratecode[PEER_STATS_FOR_NO_OF_PPDUS]; u8 success_pkts[PEER_STATS_FOR_NO_OF_PPDUS]; __le16 success_bytes[PEER_STATS_FOR_NO_OF_PPDUS]; u8 retry_pkts[PEER_STATS_FOR_NO_OF_PPDUS]; __le16 retry_bytes[PEER_STATS_FOR_NO_OF_PPDUS]; u8 failed_pkts[PEER_STATS_FOR_NO_OF_PPDUS]; __le16 failed_bytes[PEER_STATS_FOR_NO_OF_PPDUS]; u8 flags[PEER_STATS_FOR_NO_OF_PPDUS]; __le32 tx_duration; u8 tx_ppdu_cnt; u8 peer_id; } __packed; union htt_rx_pn_t { /* WEP: 24-bit PN */ u32 pn24; /* TKIP or CCMP: 48-bit PN */ u64 pn48; /* WAPI: 128-bit PN */ u64 pn128[2]; }; struct htt_cmd { struct htt_cmd_hdr hdr; union { struct htt_ver_req ver_req; struct htt_mgmt_tx_desc mgmt_tx; struct htt_data_tx_desc data_tx; struct htt_rx_ring_setup_32 rx_setup_32; struct 
htt_rx_ring_setup_64 rx_setup_64; struct htt_stats_req stats_req; struct htt_oob_sync_req oob_sync_req; struct htt_aggr_conf aggr_conf; struct htt_aggr_conf_v2 aggr_conf_v2; struct htt_frag_desc_bank_cfg32 frag_desc_bank_cfg32; struct htt_frag_desc_bank_cfg64 frag_desc_bank_cfg64; struct htt_tx_fetch_resp tx_fetch_resp; }; } __packed; struct htt_resp { struct htt_resp_hdr hdr; union { struct htt_ver_resp ver_resp; struct htt_mgmt_tx_completion mgmt_tx_completion; struct htt_data_tx_completion data_tx_completion; struct htt_rx_indication rx_ind; struct htt_rx_indication_hl rx_ind_hl; struct htt_rx_fragment_indication rx_frag_ind; struct htt_rx_peer_map peer_map; struct htt_rx_peer_unmap peer_unmap; struct htt_rx_flush rx_flush; struct htt_rx_addba rx_addba; struct htt_rx_delba rx_delba; struct htt_security_indication security_indication; struct htt_rc_update rc_update; struct htt_rx_test rx_test; struct htt_pktlog_msg pktlog_msg; struct htt_rx_pn_ind rx_pn_ind; struct htt_rx_offload_ind rx_offload_ind; struct htt_rx_in_ord_ind rx_in_ord_ind; struct htt_tx_fetch_ind tx_fetch_ind; struct htt_tx_fetch_confirm tx_fetch_confirm; struct htt_tx_mode_switch_ind tx_mode_switch_ind; struct htt_channel_change chan_change; struct htt_peer_tx_stats peer_tx_stats; } __packed; } __packed; /*** host side structures follow ***/ struct htt_tx_done { u16 msdu_id; u16 status; u8 ack_rssi; }; enum htt_tx_compl_state { HTT_TX_COMPL_STATE_NONE, HTT_TX_COMPL_STATE_ACK, HTT_TX_COMPL_STATE_NOACK, HTT_TX_COMPL_STATE_DISCARD, }; struct htt_peer_map_event { u8 vdev_id; u16 peer_id; u8 addr[ETH_ALEN]; }; struct htt_peer_unmap_event { u16 peer_id; }; struct ath10k_htt_txbuf_32 { struct htt_data_tx_desc_frag frags[2]; struct ath10k_htc_hdr htc_hdr; struct htt_cmd_hdr cmd_hdr; struct htt_data_tx_desc cmd_tx; } __packed __aligned(4); struct ath10k_htt_txbuf_64 { struct htt_data_tx_desc_frag frags[2]; struct ath10k_htc_hdr htc_hdr; struct htt_cmd_hdr cmd_hdr; struct htt_data_tx_desc_64 cmd_tx; } __packed __aligned(4); struct ath10k_htt { struct ath10k *ar; enum ath10k_htc_ep_id eid; struct sk_buff_head rx_indication_head; u8 target_version_major; u8 target_version_minor; struct completion target_version_received; u8 max_num_amsdu; u8 max_num_ampdu; const enum htt_t2h_msg_type *t2h_msg_types; u32 t2h_msg_types_max; struct { /* * Ring of network buffer objects - This ring is * used exclusively by the host SW. This ring * mirrors the dev_addrs_ring that is shared * between the host SW and the MAC HW. The host SW * uses this netbufs ring to locate the network * buffer objects whose data buffers the HW has * filled. */ struct sk_buff **netbufs_ring; /* This is used only with firmware supporting IN_ORD_IND. * * With Full Rx Reorder the HTT Rx Ring is more of a temporary * buffer ring from which buffer addresses are copied by the * firmware to MAC Rx ring. Firmware then delivers IN_ORD_IND * pointing to specific (re-ordered) buffers. * * FIXME: With kernel generic hashing functions there's a lot * of hash collisions for sk_buffs. */ bool in_ord_rx; DECLARE_HASHTABLE(skb_table, 4); /* * Ring of buffer addresses - * This ring holds the "physical" device address of the * rx buffers the host SW provides for the MAC HW to * fill. */ union { __le64 *paddrs_ring_64; __le32 *paddrs_ring_32; }; /* * Base address of ring, as a "physical" device address * rather than a CPU address. 
*/ dma_addr_t base_paddr; /* how many elems in the ring (power of 2) */ int size; /* size - 1 */ unsigned int size_mask; /* how many rx buffers to keep in the ring */ int fill_level; /* how many rx buffers (full+empty) are in the ring */ int fill_cnt; /* * alloc_idx - where HTT SW has deposited empty buffers * This is allocated in consistent mem, so that the FW can * read this variable, and program the HW's FW_IDX reg with * the value of this shadow register. */ struct { __le32 *vaddr; dma_addr_t paddr; } alloc_idx; /* where HTT SW has processed bufs filled by rx MAC DMA */ struct { unsigned int msdu_payld; } sw_rd_idx; /* * refill_retry_timer - timer triggered when the ring is * not refilled to the level expected */ struct timer_list refill_retry_timer; /* Protects access to all rx ring buffer state variables */ spinlock_t lock; } rx_ring; unsigned int prefetch_len; /* Protects access to pending_tx, num_pending_tx */ spinlock_t tx_lock; int max_num_pending_tx; int num_pending_tx; int num_pending_mgmt_tx; struct idr pending_tx; wait_queue_head_t empty_tx_wq; /* FIFO for storing tx done status {ack, no-ack, discard} and msdu id */ DECLARE_KFIFO_PTR(txdone_fifo, struct htt_tx_done); /* set if host-fw communication goes haywire * used to avoid further failures */ bool rx_confused; atomic_t num_mpdus_ready; /* This is used to group tx/rx completions separately and process them * in batches to reduce cache stalls */ struct sk_buff_head rx_msdus_q; struct sk_buff_head rx_in_ord_compl_q; struct sk_buff_head tx_fetch_ind_q; /* rx_status template */ struct ieee80211_rx_status rx_status; struct { dma_addr_t paddr; union { struct htt_msdu_ext_desc *vaddr_desc_32; struct htt_msdu_ext_desc_64 *vaddr_desc_64; }; size_t size; } frag_desc; struct { dma_addr_t paddr; union { struct ath10k_htt_txbuf_32 *vaddr_txbuff_32; struct ath10k_htt_txbuf_64 *vaddr_txbuff_64; }; size_t size; } txbuf; struct { bool enabled; struct htt_q_state *vaddr; dma_addr_t paddr; u16 num_push_allowed; u16 num_peers; u16 num_tids; enum htt_tx_mode_switch_mode mode; enum htt_q_depth_type type; } tx_q_state; bool tx_mem_allocated; const struct ath10k_htt_tx_ops *tx_ops; const struct ath10k_htt_rx_ops *rx_ops; bool disable_tx_comp; bool bundle_tx; struct sk_buff_head tx_req_head; struct sk_buff_head tx_complete_head; }; struct ath10k_htt_tx_ops { int (*htt_send_rx_ring_cfg)(struct ath10k_htt *htt); int (*htt_send_frag_desc_bank_cfg)(struct ath10k_htt *htt); int (*htt_alloc_frag_desc)(struct ath10k_htt *htt); void (*htt_free_frag_desc)(struct ath10k_htt *htt); int (*htt_tx)(struct ath10k_htt *htt, enum ath10k_hw_txrx_mode txmode, struct sk_buff *msdu); int (*htt_alloc_txbuff)(struct ath10k_htt *htt); void (*htt_free_txbuff)(struct ath10k_htt *htt); int (*htt_h2t_aggr_cfg_msg)(struct ath10k_htt *htt, u8 max_subfrms_ampdu, u8 max_subfrms_amsdu); void (*htt_flush_tx)(struct ath10k_htt *htt); }; static inline int ath10k_htt_send_rx_ring_cfg(struct ath10k_htt *htt) { if (!htt->tx_ops->htt_send_rx_ring_cfg) return -EOPNOTSUPP; return htt->tx_ops->htt_send_rx_ring_cfg(htt); } static inline int ath10k_htt_send_frag_desc_bank_cfg(struct ath10k_htt *htt) { if (!htt->tx_ops->htt_send_frag_desc_bank_cfg) return -EOPNOTSUPP; return htt->tx_ops->htt_send_frag_desc_bank_cfg(htt); } static inline int ath10k_htt_alloc_frag_desc(struct ath10k_htt *htt) { if (!htt->tx_ops->htt_alloc_frag_desc) return -EOPNOTSUPP; return htt->tx_ops->htt_alloc_frag_desc(htt); } static inline void ath10k_htt_free_frag_desc(struct ath10k_htt *htt) { if 
(htt->tx_ops->htt_free_frag_desc) htt->tx_ops->htt_free_frag_desc(htt); } static inline int ath10k_htt_tx(struct ath10k_htt *htt, enum ath10k_hw_txrx_mode txmode, struct sk_buff *msdu) { return htt->tx_ops->htt_tx(htt, txmode, msdu); } static inline void ath10k_htt_flush_tx(struct ath10k_htt *htt) { if (htt->tx_ops->htt_flush_tx) htt->tx_ops->htt_flush_tx(htt); } static inline int ath10k_htt_alloc_txbuff(struct ath10k_htt *htt) { if (!htt->tx_ops->htt_alloc_txbuff) return -EOPNOTSUPP; return htt->tx_ops->htt_alloc_txbuff(htt); } static inline void ath10k_htt_free_txbuff(struct ath10k_htt *htt) { if (htt->tx_ops->htt_free_txbuff) htt->tx_ops->htt_free_txbuff(htt); } static inline int ath10k_htt_h2t_aggr_cfg_msg(struct ath10k_htt *htt, u8 max_subfrms_ampdu, u8 max_subfrms_amsdu) { if (!htt->tx_ops->htt_h2t_aggr_cfg_msg) return -EOPNOTSUPP; return htt->tx_ops->htt_h2t_aggr_cfg_msg(htt, max_subfrms_ampdu, max_subfrms_amsdu); } struct ath10k_htt_rx_ops { size_t (*htt_get_rx_ring_size)(struct ath10k_htt *htt); void (*htt_config_paddrs_ring)(struct ath10k_htt *htt, void *vaddr); void (*htt_set_paddrs_ring)(struct ath10k_htt *htt, dma_addr_t paddr, int idx); void* (*htt_get_vaddr_ring)(struct ath10k_htt *htt); void (*htt_reset_paddrs_ring)(struct ath10k_htt *htt, int idx); bool (*htt_rx_proc_rx_frag_ind)(struct ath10k_htt *htt, struct htt_rx_fragment_indication *rx, struct sk_buff *skb); }; static inline size_t ath10k_htt_get_rx_ring_size(struct ath10k_htt *htt) { if (!htt->rx_ops->htt_get_rx_ring_size) return 0; return htt->rx_ops->htt_get_rx_ring_size(htt); } static inline void ath10k_htt_config_paddrs_ring(struct ath10k_htt *htt, void *vaddr) { if (htt->rx_ops->htt_config_paddrs_ring) htt->rx_ops->htt_config_paddrs_ring(htt, vaddr); } static inline void ath10k_htt_set_paddrs_ring(struct ath10k_htt *htt, dma_addr_t paddr, int idx) { if (htt->rx_ops->htt_set_paddrs_ring) htt->rx_ops->htt_set_paddrs_ring(htt, paddr, idx); } static inline void *ath10k_htt_get_vaddr_ring(struct ath10k_htt *htt) { if (!htt->rx_ops->htt_get_vaddr_ring) return NULL; return htt->rx_ops->htt_get_vaddr_ring(htt); } static inline void ath10k_htt_reset_paddrs_ring(struct ath10k_htt *htt, int idx) { if (htt->rx_ops->htt_reset_paddrs_ring) htt->rx_ops->htt_reset_paddrs_ring(htt, idx); } static inline bool ath10k_htt_rx_proc_rx_frag_ind(struct ath10k_htt *htt, struct htt_rx_fragment_indication *rx, struct sk_buff *skb) { if (!htt->rx_ops->htt_rx_proc_rx_frag_ind) return true; return htt->rx_ops->htt_rx_proc_rx_frag_ind(htt, rx, skb); } /* the driver strongly assumes that the rx header status be 64 bytes long, * so all possible rx_desc structures must respect this assumption. */ #define RX_HTT_HDR_STATUS_LEN 64 /* The rx descriptor structure layout is programmed via rx ring setup * so that FW knows how to transfer the rx descriptor to the host. * Unfortunately, though, QCA6174's firmware doesn't currently behave correctly * when modifying the structure layout of the rx descriptor beyond what it expects * (even if it correctly programmed during the rx ring setup). * Therefore we must keep two different memory layouts, abstract the rx descriptor * representation and use ath10k_rx_desc_ops * for correctly accessing rx descriptor data. 
*/ /* base struct used for abstracting the rx descriptor representation */ struct htt_rx_desc { union { /* This field is filled on the host using the msdu buffer * from htt_rx_indication */ struct fw_rx_desc_base fw_desc; u32 pad; } __packed; } __packed; /* rx descriptor for wcn3990 and possibly extensible for newer cards * Buffers like this are placed on the rx ring. */ struct htt_rx_desc_v2 { struct htt_rx_desc base; struct { struct rx_attention attention; struct rx_frag_info frag_info; struct rx_mpdu_start mpdu_start; struct rx_msdu_start msdu_start; struct rx_msdu_end msdu_end; struct rx_mpdu_end mpdu_end; struct rx_ppdu_start ppdu_start; struct rx_ppdu_end ppdu_end; } __packed; u8 rx_hdr_status[RX_HTT_HDR_STATUS_LEN]; u8 msdu_payload[]; }; /* QCA6174, QCA988x, QCA99x0 dedicated rx descriptor to make sure their firmware * works correctly. We keep a single rx descriptor for all these three * families of cards because from tests it seems to be the most stable solution, * e.g. having a rx descriptor only for QCA6174 seldom caused firmware crashes * during some tests. * Buffers like this are placed on the rx ring. */ struct htt_rx_desc_v1 { struct htt_rx_desc base; struct { struct rx_attention attention; struct rx_frag_info_v1 frag_info; struct rx_mpdu_start mpdu_start; struct rx_msdu_start_v1 msdu_start; struct rx_msdu_end_v1 msdu_end; struct rx_mpdu_end mpdu_end; struct rx_ppdu_start ppdu_start; struct rx_ppdu_end_v1 ppdu_end; } __packed; u8 rx_hdr_status[RX_HTT_HDR_STATUS_LEN]; u8 msdu_payload[]; }; /* rx_desc abstraction */ struct ath10k_htt_rx_desc_ops { /* These fields are mandatory, they must be specified in any instance */ /* sizeof() of the rx_desc structure used by this hw */ size_t rx_desc_size; /* offset of msdu_payload inside the rx_desc structure used by this hw */ size_t rx_desc_msdu_payload_offset; /* These fields are options. 
* When a field is not provided the default implementation gets used * (see the ath10k_rx_desc_* operations below for more info about the defaults) */ bool (*rx_desc_get_msdu_limit_error)(struct htt_rx_desc *rxd); int (*rx_desc_get_l3_pad_bytes)(struct htt_rx_desc *rxd); /* Safely cast from a void* buffer containing an rx descriptor * to the proper rx_desc structure */ struct htt_rx_desc *(*rx_desc_from_raw_buffer)(void *buff); void (*rx_desc_get_offsets)(struct htt_rx_ring_rx_desc_offsets *offs); struct rx_attention *(*rx_desc_get_attention)(struct htt_rx_desc *rxd); struct rx_frag_info_common *(*rx_desc_get_frag_info)(struct htt_rx_desc *rxd); struct rx_mpdu_start *(*rx_desc_get_mpdu_start)(struct htt_rx_desc *rxd); struct rx_mpdu_end *(*rx_desc_get_mpdu_end)(struct htt_rx_desc *rxd); struct rx_msdu_start_common *(*rx_desc_get_msdu_start)(struct htt_rx_desc *rxd); struct rx_msdu_end_common *(*rx_desc_get_msdu_end)(struct htt_rx_desc *rxd); struct rx_ppdu_start *(*rx_desc_get_ppdu_start)(struct htt_rx_desc *rxd); struct rx_ppdu_end_common *(*rx_desc_get_ppdu_end)(struct htt_rx_desc *rxd); u8 *(*rx_desc_get_rx_hdr_status)(struct htt_rx_desc *rxd); u8 *(*rx_desc_get_msdu_payload)(struct htt_rx_desc *rxd); }; extern const struct ath10k_htt_rx_desc_ops qca988x_rx_desc_ops; extern const struct ath10k_htt_rx_desc_ops qca99x0_rx_desc_ops; extern const struct ath10k_htt_rx_desc_ops wcn3990_rx_desc_ops; static inline int ath10k_htt_rx_desc_get_l3_pad_bytes(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { if (hw->rx_desc_ops->rx_desc_get_l3_pad_bytes) return hw->rx_desc_ops->rx_desc_get_l3_pad_bytes(rxd); return 0; } static inline bool ath10k_htt_rx_desc_msdu_limit_error(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { if (hw->rx_desc_ops->rx_desc_get_msdu_limit_error) return hw->rx_desc_ops->rx_desc_get_msdu_limit_error(rxd); return false; } /* The default implementation of all these getters is using the old rx_desc, * so that it is easier to define the ath10k_htt_rx_desc_ops instances. * But probably, if new wireless cards must be supported, it would be better * to switch the default implementation to the new rx_desc, since this would * make the extension easier . 
*/ static inline struct htt_rx_desc * ath10k_htt_rx_desc_from_raw_buffer(struct ath10k_hw_params *hw, void *buff) { if (hw->rx_desc_ops->rx_desc_from_raw_buffer) return hw->rx_desc_ops->rx_desc_from_raw_buffer(buff); return &((struct htt_rx_desc_v1 *)buff)->base; } static inline void ath10k_htt_rx_desc_get_offsets(struct ath10k_hw_params *hw, struct htt_rx_ring_rx_desc_offsets *off) { if (hw->rx_desc_ops->rx_desc_get_offsets) { hw->rx_desc_ops->rx_desc_get_offsets(off); } else { #define desc_offset(x) (offsetof(struct htt_rx_desc_v1, x) / 4) off->mac80211_hdr_offset = __cpu_to_le16(desc_offset(rx_hdr_status)); off->msdu_payload_offset = __cpu_to_le16(desc_offset(msdu_payload)); off->ppdu_start_offset = __cpu_to_le16(desc_offset(ppdu_start)); off->ppdu_end_offset = __cpu_to_le16(desc_offset(ppdu_end)); off->mpdu_start_offset = __cpu_to_le16(desc_offset(mpdu_start)); off->mpdu_end_offset = __cpu_to_le16(desc_offset(mpdu_end)); off->msdu_start_offset = __cpu_to_le16(desc_offset(msdu_start)); off->msdu_end_offset = __cpu_to_le16(desc_offset(msdu_end)); off->rx_attention_offset = __cpu_to_le16(desc_offset(attention)); off->frag_info_offset = __cpu_to_le16(desc_offset(frag_info)); #undef desc_offset } } static inline struct rx_attention * ath10k_htt_rx_desc_get_attention(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_attention) return hw->rx_desc_ops->rx_desc_get_attention(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->attention; } static inline struct rx_frag_info_common * ath10k_htt_rx_desc_get_frag_info(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_frag_info) return hw->rx_desc_ops->rx_desc_get_frag_info(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->frag_info.common; } static inline struct rx_mpdu_start * ath10k_htt_rx_desc_get_mpdu_start(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_mpdu_start) return hw->rx_desc_ops->rx_desc_get_mpdu_start(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->mpdu_start; } static inline struct rx_mpdu_end * ath10k_htt_rx_desc_get_mpdu_end(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_mpdu_end) return hw->rx_desc_ops->rx_desc_get_mpdu_end(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->mpdu_end; } static inline struct rx_msdu_start_common * ath10k_htt_rx_desc_get_msdu_start(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_msdu_start) return hw->rx_desc_ops->rx_desc_get_msdu_start(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->msdu_start.common; } static inline struct rx_msdu_end_common * ath10k_htt_rx_desc_get_msdu_end(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_msdu_end) return hw->rx_desc_ops->rx_desc_get_msdu_end(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->msdu_end.common; } static inline struct rx_ppdu_start * ath10k_htt_rx_desc_get_ppdu_start(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_ppdu_start) return 
hw->rx_desc_ops->rx_desc_get_ppdu_start(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->ppdu_start; } static inline struct rx_ppdu_end_common * ath10k_htt_rx_desc_get_ppdu_end(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_ppdu_end) return hw->rx_desc_ops->rx_desc_get_ppdu_end(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return &rx_desc->ppdu_end.common; } static inline u8 * ath10k_htt_rx_desc_get_rx_hdr_status(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_rx_hdr_status) return hw->rx_desc_ops->rx_desc_get_rx_hdr_status(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return rx_desc->rx_hdr_status; } static inline u8 * ath10k_htt_rx_desc_get_msdu_payload(struct ath10k_hw_params *hw, struct htt_rx_desc *rxd) { struct htt_rx_desc_v1 *rx_desc; if (hw->rx_desc_ops->rx_desc_get_msdu_payload) return hw->rx_desc_ops->rx_desc_get_msdu_payload(rxd); rx_desc = container_of(rxd, struct htt_rx_desc_v1, base); return rx_desc->msdu_payload; } #define HTT_RX_DESC_HL_INFO_SEQ_NUM_MASK 0x00000fff #define HTT_RX_DESC_HL_INFO_SEQ_NUM_LSB 0 #define HTT_RX_DESC_HL_INFO_ENCRYPTED_MASK 0x00001000 #define HTT_RX_DESC_HL_INFO_ENCRYPTED_LSB 12 #define HTT_RX_DESC_HL_INFO_CHAN_INFO_PRESENT_MASK 0x00002000 #define HTT_RX_DESC_HL_INFO_CHAN_INFO_PRESENT_LSB 13 #define HTT_RX_DESC_HL_INFO_MCAST_BCAST_MASK 0x00010000 #define HTT_RX_DESC_HL_INFO_MCAST_BCAST_LSB 16 #define HTT_RX_DESC_HL_INFO_KEY_ID_OCT_MASK 0x01fe0000 #define HTT_RX_DESC_HL_INFO_KEY_ID_OCT_LSB 17 struct htt_rx_desc_base_hl { __le32 info; /* HTT_RX_DESC_HL_INFO_ */ }; struct htt_rx_chan_info { __le16 primary_chan_center_freq_mhz; __le16 contig_chan1_center_freq_mhz; __le16 contig_chan2_center_freq_mhz; u8 phy_mode; u8 reserved; } __packed; #define HTT_RX_DESC_ALIGN 8 #define HTT_MAC_ADDR_LEN 6 /* * FIX THIS * Should be: sizeof(struct htt_host_rx_desc) + max rx MSDU size, * rounded up to a cache line size. */ #define HTT_RX_BUF_SIZE 2048 /* The HTT_RX_MSDU_SIZE can't be statically computed anymore, * because it depends on the underlying device rx_desc representation */ static inline int ath10k_htt_rx_msdu_size(struct ath10k_hw_params *hw) { return HTT_RX_BUF_SIZE - (int)hw->rx_desc_ops->rx_desc_size; } /* Refill a bunch of RX buffers for each refill round so that FW/HW can handle * aggregated traffic more nicely. */ #define ATH10K_HTT_MAX_NUM_REFILL 100 /* * DMA_MAP expects the buffer to be an integral number of cache lines. * Rather than checking the actual cache line size, this code makes a * conservative estimate of what the cache line size could be. */ #define HTT_LOG2_MAX_CACHE_LINE_SIZE 7 /* 2^7 = 128 */ #define HTT_MAX_CACHE_LINE_SIZE_MASK ((1 << HTT_LOG2_MAX_CACHE_LINE_SIZE) - 1) /* These values are default in most firmware revisions and apparently are a * sweet spot performance wise. 
*/ #define ATH10K_HTT_MAX_NUM_AMSDU_DEFAULT 3 #define ATH10K_HTT_MAX_NUM_AMPDU_DEFAULT 64 int ath10k_htt_connect(struct ath10k_htt *htt); int ath10k_htt_init(struct ath10k *ar); int ath10k_htt_setup(struct ath10k_htt *htt); int ath10k_htt_tx_start(struct ath10k_htt *htt); void ath10k_htt_tx_stop(struct ath10k_htt *htt); void ath10k_htt_tx_destroy(struct ath10k_htt *htt); void ath10k_htt_tx_free(struct ath10k_htt *htt); int ath10k_htt_rx_alloc(struct ath10k_htt *htt); int ath10k_htt_rx_ring_refill(struct ath10k *ar); void ath10k_htt_rx_free(struct ath10k_htt *htt); void ath10k_htt_htc_tx_complete(struct ath10k *ar, struct sk_buff *skb); void ath10k_htt_htc_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb); bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb); int ath10k_htt_h2t_ver_req_msg(struct ath10k_htt *htt); int ath10k_htt_h2t_stats_req(struct ath10k_htt *htt, u32 mask, u32 reset_mask, u64 cookie); void ath10k_htt_hif_tx_complete(struct ath10k *ar, struct sk_buff *skb); int ath10k_htt_tx_fetch_resp(struct ath10k *ar, __le32 token, __le16 fetch_seq_num, struct htt_tx_fetch_record *records, size_t num_records); void ath10k_htt_op_ep_tx_credits(struct ath10k *ar); void ath10k_htt_tx_txq_update(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void ath10k_htt_tx_txq_recalc(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void ath10k_htt_tx_txq_sync(struct ath10k *ar); void ath10k_htt_tx_dec_pending(struct ath10k_htt *htt); int ath10k_htt_tx_inc_pending(struct ath10k_htt *htt); void ath10k_htt_tx_mgmt_dec_pending(struct ath10k_htt *htt); int ath10k_htt_tx_mgmt_inc_pending(struct ath10k_htt *htt, bool is_mgmt, bool is_presp); int ath10k_htt_tx_alloc_msdu_id(struct ath10k_htt *htt, struct sk_buff *skb); void ath10k_htt_tx_free_msdu_id(struct ath10k_htt *htt, u16 msdu_id); int ath10k_htt_mgmt_tx(struct ath10k_htt *htt, struct sk_buff *msdu); void ath10k_htt_rx_pktlog_completion_handler(struct ath10k *ar, struct sk_buff *skb); int ath10k_htt_txrx_compl_task(struct ath10k *ar, int budget); int ath10k_htt_rx_hl_indication(struct ath10k *ar, int budget); void ath10k_htt_set_tx_ops(struct ath10k_htt *htt); void ath10k_htt_set_rx_ops(struct ath10k_htt *htt); #endif |
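/*
 * Illustrative userspace sketch (not part of the ath10k driver): shows how
 * mask/LSB pairs such as the HTT_RX_DESC_HL_INFO_* defines above are
 * typically used to pull individual fields out of the 32-bit "info" word of
 * struct htt_rx_desc_base_hl. The field layouts are copied from the defines
 * above; the helper name and the sample descriptor value are made up for the
 * example.
 */
#include <stdint.h>
#include <stdio.h>

#define HL_INFO_SEQ_NUM_MASK    0x00000fffu
#define HL_INFO_SEQ_NUM_LSB     0
#define HL_INFO_ENCRYPTED_MASK  0x00001000u
#define HL_INFO_ENCRYPTED_LSB   12
#define HL_INFO_KEY_ID_OCT_MASK 0x01fe0000u
#define HL_INFO_KEY_ID_OCT_LSB  17

static unsigned int hl_info_get(uint32_t info, uint32_t mask, unsigned int lsb)
{
	return (info & mask) >> lsb;
}

int main(void)
{
	uint32_t info = 0x00221234;	/* arbitrary sample descriptor word */

	printf("seq_num    = %u\n", hl_info_get(info, HL_INFO_SEQ_NUM_MASK, HL_INFO_SEQ_NUM_LSB));
	printf("encrypted  = %u\n", hl_info_get(info, HL_INFO_ENCRYPTED_MASK, HL_INFO_ENCRYPTED_LSB));
	printf("key_id_oct = %u\n", hl_info_get(info, HL_INFO_KEY_ID_OCT_MASK, HL_INFO_KEY_ID_OCT_LSB));
	return 0;
}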
| 3 58 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | // SPDX-License-Identifier: GPL-2.0+ #include <drm/drm_atomic_helper.h> #include <drm/drm_edid.h> #include <drm/drm_managed.h> #include <drm/drm_probe_helper.h> #include "vkms_connector.h" static const struct drm_connector_funcs vkms_connector_funcs = { .fill_modes = drm_helper_probe_single_connector_modes, .reset = drm_atomic_helper_connector_reset, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; static int vkms_conn_get_modes(struct drm_connector *connector) { int count; /* Use the default modes list from DRM */ count = drm_add_modes_noedid(connector, XRES_MAX, YRES_MAX); drm_set_preferred_mode(connector, XRES_DEF, YRES_DEF); return count; } static struct drm_encoder *vkms_conn_best_encoder(struct drm_connector *connector) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) return encoder; return NULL; } static const struct drm_connector_helper_funcs vkms_conn_helper_funcs = { .get_modes = vkms_conn_get_modes, .best_encoder = vkms_conn_best_encoder, }; struct vkms_connector *vkms_connector_init(struct vkms_device *vkmsdev) { struct drm_device *dev = &vkmsdev->drm; struct vkms_connector *connector; int ret; connector = drmm_kzalloc(dev, sizeof(*connector), GFP_KERNEL); if (!connector) return ERR_PTR(-ENOMEM); ret = drmm_connector_init(dev, &connector->base, &vkms_connector_funcs, DRM_MODE_CONNECTOR_VIRTUAL, NULL); if (ret) return ERR_PTR(ret); drm_connector_helper_add(&connector->base, &vkms_conn_helper_funcs); return connector; } |
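/*
 * Illustrative userspace sketch (not kernel code): models the ERR_PTR idiom
 * used by vkms_connector_init() above, where a failed allocation or init
 * returns ERR_PTR(-ENOMEM)/ERR_PTR(ret) instead of NULL and the caller
 * unpacks the errno. The helpers below are simplified local stand-ins for
 * the real <linux/err.h> macros, and fake_connector_init() is invented for
 * the example.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static void *err_ptr(long error)
{
	return (void *)error;		/* negative errno encoded in the pointer */
}

static int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static long ptr_err(const void *ptr)
{
	return (long)ptr;
}

/* Stand-in for a constructor that can fail, like vkms_connector_init(). */
static void *fake_connector_init(int fail)
{
	if (fail)
		return err_ptr(-ENOMEM);
	return malloc(16);
}

int main(void)
{
	void *conn = fake_connector_init(1);

	if (is_err(conn))
		printf("init failed: errno %ld\n", -ptr_err(conn));
	else
		free(conn);
	return 0;
}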
| 8 3 1 63 24 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H #include <linux/swap.h> #include <linux/swapops.h> static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } static inline pte_t huge_pte_mkwrite(pte_t pte) { return pte_mkwrite_novma(pte); } #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); } #endif static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); } static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) { return pte_modify(pte, newprot); } #ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return ptep_clear_flush(vma, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } #endif /* Please refer to comments above pte_none_mostly() for the usage */ #ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_set_wrprotect(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { return ptep_set_access_flags(vma, addr, ptep, pte, dirty); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ #endif /* _ASM_GENERIC_HUGETLB_H */ |
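/*
 * Illustrative sketch (plain C, not kernel code): the override pattern used
 * throughout the generic hugetlb header above. An architecture may provide
 * its own implementation and define the matching __HAVE_ARCH_* guard, in
 * which case the generic fallback under #ifndef is skipped. The names
 * my_op/__HAVE_ARCH_MY_OP are made up for the example.
 */
#include <stdio.h>

/* Toggle this define to emulate an architecture that overrides the op. */
/* #define __HAVE_ARCH_MY_OP */

#ifdef __HAVE_ARCH_MY_OP
static inline int my_op(int x)
{
	return x * 2;		/* pretend arch-specific implementation */
}
#endif

#ifndef __HAVE_ARCH_MY_OP
static inline int my_op(int x)
{
	return x + 1;		/* generic fallback, as in the header above */
}
#endif

int main(void)
{
	printf("my_op(41) = %d\n", my_op(41));
	return 0;
}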
/* SPDX-License-Identifier: GPL-2.0+ */ /* * Driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. 
*/ #include <linux/bits.h> #include <linux/serial_8250.h> #include <linux/serial_core.h> #include <linux/dmaengine.h> #include "../serial_mctrl_gpio.h" struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); void (*prepare_tx_dma)(struct uart_8250_port *p); void (*prepare_rx_dma)(struct uart_8250_port *p); /* Filter function */ dma_filter_fn fn; /* Parameter to the filter function */ void *rx_param; void *tx_param; struct dma_slave_config rxconf; struct dma_slave_config txconf; struct dma_chan *rxchan; struct dma_chan *txchan; /* Device address base for DMA operations */ phys_addr_t rx_dma_addr; phys_addr_t tx_dma_addr; /* DMA address of the buffer in memory */ dma_addr_t rx_addr; dma_addr_t tx_addr; dma_cookie_t rx_cookie; dma_cookie_t tx_cookie; void *rx_buf; size_t rx_size; size_t tx_size; unsigned char tx_running; unsigned char tx_err; unsigned char rx_running; }; struct old_serial_port { unsigned int uart; unsigned int baud_base; unsigned int port; unsigned int irq; upf_t flags; unsigned char io_type; unsigned char __iomem *iomem_base; unsigned short iomem_reg_shift; }; struct serial8250_config { const char *name; unsigned short fifo_size; unsigned short tx_loadsz; unsigned char fcr; unsigned char rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE]; unsigned int flags; }; #define UART_CAP_FIFO BIT(8) /* UART has FIFO */ #define UART_CAP_EFR BIT(9) /* UART has EFR */ #define UART_CAP_SLEEP BIT(10) /* UART has IER sleep */ #define UART_CAP_AFE BIT(11) /* MCR-based hw flow control */ #define UART_CAP_UUE BIT(12) /* UART needs IER bit 6 set (Xscale) */ #define UART_CAP_RTOIE BIT(13) /* UART needs IER bit 4 set (Xscale, Tegra) */ #define UART_CAP_HFIFO BIT(14) /* UART has a "hidden" FIFO */ #define UART_CAP_RPM BIT(15) /* Runtime PM is active while idle */ #define UART_CAP_IRDA BIT(16) /* UART supports IrDA line discipline */ #define UART_CAP_MINI BIT(17) /* Mini UART on BCM283X family lacks: * STOP PARITY EPAR SPAR WLEN5 WLEN6 */ #define UART_CAP_NOTEMT BIT(18) /* UART without interrupt on TEMT available */ #define UART_BUG_QUOT BIT(0) /* UART has buggy quot LSB */ #define UART_BUG_TXEN BIT(1) /* UART has buggy TX IIR status */ #define UART_BUG_NOMSR BIT(2) /* UART has buggy MSR status bits (Au1x00) */ #define UART_BUG_THRE BIT(3) /* UART has buggy THRE reassertion */ #define UART_BUG_TXRACE BIT(5) /* UART Tx fails to set remote DR */ /* Module parameters */ #define UART_NR CONFIG_SERIAL_8250_NR_UARTS extern unsigned int nr_uarts; #ifdef CONFIG_SERIAL_8250_SHARE_IRQ #define SERIAL8250_SHARE_IRQS 1 #else #define SERIAL8250_SHARE_IRQS 0 #endif extern unsigned int share_irqs; extern unsigned int skip_txen_test; #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ { \ .iobase = _base, \ .irq = _irq, \ .uartclk = 1843200, \ .iotype = UPIO_PORT, \ .flags = UPF_BOOT_AUTOCONF | (_flags), \ } #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0) extern struct uart_driver serial8250_reg; void serial8250_register_ports(struct uart_driver *drv, struct device *dev); /* Legacy ISA bus related APIs */ typedef void (*serial8250_isa_config_fn)(int, struct uart_port *, u32 *); extern serial8250_isa_config_fn serial8250_isa_config; void serial8250_isa_init_ports(void); extern struct platform_device *serial8250_isa_devs; extern const struct uart_ops *univ8250_port_base_ops; extern struct uart_ops univ8250_port_ops; static inline int serial_in(struct uart_8250_port *up, int offset) { return up->port.serial_in(&up->port, offset); } static inline void 
serial_out(struct uart_8250_port *up, int offset, int value) { up->port.serial_out(&up->port, offset, value); } /** * serial_lsr_in - Read LSR register and preserve flags across reads * @up: uart 8250 port * * Read LSR register and handle saving non-preserved flags across reads. * The flags that are not preserved across reads are stored into * up->lsr_saved_flags. * * Returns LSR value or'ed with the preserved flags (if any). */ static inline u16 serial_lsr_in(struct uart_8250_port *up) { u16 lsr = up->lsr_saved_flags; lsr |= serial_in(up, UART_LSR); up->lsr_saved_flags = lsr & up->lsr_save_mask; return lsr; } /* * For the 16C950 */ static void serial_icr_write(struct uart_8250_port *up, int offset, int value) { serial_out(up, UART_SCR, offset); serial_out(up, UART_ICR, value); } static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up, int offset) { unsigned int value; serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD); serial_out(up, UART_SCR, offset); value = serial_in(up, UART_ICR); serial_icr_write(up, UART_ACR, up->acr); return value; } void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p); void serial8250_rpm_get(struct uart_8250_port *p); void serial8250_rpm_put(struct uart_8250_port *p); DEFINE_GUARD(serial8250_rpm, struct uart_8250_port *, serial8250_rpm_get(_T), serial8250_rpm_put(_T)); static inline u32 serial_dl_read(struct uart_8250_port *up) { return up->dl_read(up); } static inline void serial_dl_write(struct uart_8250_port *up, u32 value) { up->dl_write(up, value); } static inline bool serial8250_set_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } static inline bool serial8250_clear_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. 
*/ lockdep_assert_held_once(&up->port.lock); if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } struct uart_8250_port *serial8250_setup_port(int index); struct uart_8250_port *serial8250_get_port(int line); int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485); void serial8250_em485_start_tx(struct uart_8250_port *p, bool toggle_ier); void serial8250_em485_stop_tx(struct uart_8250_port *p, bool toggle_ier); void serial8250_em485_destroy(struct uart_8250_port *p); extern struct serial_rs485 serial8250_em485_supported; /* MCR <-> TIOCM conversion */ static inline int serial8250_TIOCM_to_MCR(int tiocm) { int mcr = 0; if (tiocm & TIOCM_RTS) mcr |= UART_MCR_RTS; if (tiocm & TIOCM_DTR) mcr |= UART_MCR_DTR; if (tiocm & TIOCM_OUT1) mcr |= UART_MCR_OUT1; if (tiocm & TIOCM_OUT2) mcr |= UART_MCR_OUT2; if (tiocm & TIOCM_LOOP) mcr |= UART_MCR_LOOP; return mcr; } static inline int serial8250_MCR_to_TIOCM(int mcr) { int tiocm = 0; if (mcr & UART_MCR_RTS) tiocm |= TIOCM_RTS; if (mcr & UART_MCR_DTR) tiocm |= TIOCM_DTR; if (mcr & UART_MCR_OUT1) tiocm |= TIOCM_OUT1; if (mcr & UART_MCR_OUT2) tiocm |= TIOCM_OUT2; if (mcr & UART_MCR_LOOP) tiocm |= TIOCM_LOOP; return tiocm; } /* MSR <-> TIOCM conversion */ static inline int serial8250_MSR_to_TIOCM(int msr) { int tiocm = 0; if (msr & UART_MSR_DCD) tiocm |= TIOCM_CAR; if (msr & UART_MSR_RI) tiocm |= TIOCM_RNG; if (msr & UART_MSR_DSR) tiocm |= TIOCM_DSR; if (msr & UART_MSR_CTS) tiocm |= TIOCM_CTS; return tiocm; } static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { serial_out(up, UART_MCR, value); if (up->gpios) mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value)); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { int mctrl; mctrl = serial_in(up, UART_MCR); if (up->gpios) { unsigned int mctrl_gpio = 0; mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio); } return mctrl; } #ifdef CONFIG_SERIAL_8250_PNP int serial8250_pnp_init(void); void serial8250_pnp_exit(void); #else static inline int serial8250_pnp_init(void) { return 0; } static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_RSA void univ8250_rsa_support(struct uart_ops *ops); void rsa_enable(struct uart_8250_port *up); void rsa_disable(struct uart_8250_port *up); void rsa_autoconfig(struct uart_8250_port *up); void rsa_reset(struct uart_8250_port *up); #else static inline void univ8250_rsa_support(struct uart_ops *ops) { } static inline void rsa_enable(struct uart_8250_port *up) {} static inline void rsa_disable(struct uart_8250_port *up) {} static inline void rsa_autoconfig(struct uart_8250_port *up) {} static inline void rsa_reset(struct uart_8250_port *up) {} #endif #ifdef CONFIG_SERIAL_8250_FINTEK int fintek_8250_probe(struct uart_8250_port *uart); #else static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; } #endif #ifdef CONFIG_ARCH_OMAP1 #include <linux/soc/ti/omap1-soc.h> static inline int is_omap1_8250(struct uart_8250_port *pt) { int res; switch (pt->port.mapbase) { case OMAP1_UART1_BASE: case OMAP1_UART2_BASE: case OMAP1_UART3_BASE: res = 1; break; default: res = 0; break; } return res; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { if (!cpu_is_omap1510()) return 0; return is_omap1_8250(pt); } #else static inline int is_omap1_8250(struct uart_8250_port *pt) { return 0; } static inline int 
is_omap1510_8250(struct uart_8250_port *pt) { return 0; } #endif #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); extern void serial8250_tx_dma_flush(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); extern void serial8250_release_dma(struct uart_8250_port *); static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_tx_dma) dma->prepare_tx_dma(p); } static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_rx_dma) dma->prepare_rx_dma(p); } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; return dma && dma->tx_running; } #else static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_tx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_request_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_release_dma(struct uart_8250_port *p) { } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { return false; } #endif static inline int ns16550a_goto_highspeed(struct uart_8250_port *up) { unsigned char status; status = serial_in(up, 0x04); /* EXCR2 */ #define PRESL(x) ((x) & 0x30) if (PRESL(status) == 0x10) { /* already in high speed mode */ return 0; } else { status &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */ status |= 0x10; /* 1.625 divisor for baud_base --> 921600 */ serial_out(up, 0x04, status); } return 1; } static inline int serial_index(struct uart_port *port) { return port->minor - 64; } |
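/*
 * Illustrative userspace sketch (not driver code): models the pattern of
 * serial_lsr_in() above, where LSR bits that are cleared by the act of
 * reading the register are accumulated in lsr_saved_flags (masked by
 * lsr_save_mask) so a later reader still sees them. The fake register,
 * its values and the save mask below are invented for the example.
 */
#include <stdio.h>

struct fake_port {
	unsigned int lsr_saved_flags;
	unsigned int lsr_save_mask;	/* bits that must survive across reads */
	unsigned int hw_lsr;		/* pretend hardware status register */
};

static unsigned int fake_read_lsr(struct fake_port *p)
{
	unsigned int v = p->hw_lsr;

	p->hw_lsr &= ~p->lsr_save_mask;	/* reading clears the volatile bits */
	return v;
}

static unsigned int lsr_in(struct fake_port *p)
{
	unsigned int lsr = p->lsr_saved_flags;

	lsr |= fake_read_lsr(p);
	p->lsr_saved_flags = lsr & p->lsr_save_mask;
	return lsr;
}

int main(void)
{
	struct fake_port p = {
		.lsr_saved_flags = 0,
		.lsr_save_mask = 0x10,	/* e.g. a break-indication style bit */
		.hw_lsr = 0x71,		/* break-style bit plus other status bits */
	};

	printf("first read:  %#x\n", lsr_in(&p));
	printf("second read: %#x\n", lsr_in(&p));	/* saved bit still reported */
	return 0;
}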
/* SPDX-License-Identifier: GPL-2.0 */ #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include "amigaffs.h" #include <linux/mutex.h> #include <linux/workqueue.h> /* Ugly macros make the code more pretty. */ #define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) #define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) #define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) #define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) #define AFFS_ROOT_TAIL(sb, bh) ((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail))) #define AFFS_DATA_HEAD(bh) ((struct affs_data_head *)(bh)->b_data) #define AFFS_DATA(bh) (((struct affs_data_head *)(bh)->b_data)->data) #define AFFS_CACHE_SIZE PAGE_SIZE #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) #define AFFSNAMEMAX 30U struct affs_ext_key { u32 ext; /* idx of the extended block */ u32 key; /* block number */ }; /* * affs fs inode data in memory */ struct affs_inode_info { atomic_t i_opencnt; struct mutex i_link_lock; /* Protects internal inode access. */ struct mutex i_ext_lock; /* Protects internal inode access. 

*/ #define i_hash_lock i_ext_lock u32 i_blkcnt; /* block count */ u32 i_extcnt; /* extended block count */ u32 *i_lc; /* linear cache of extended blocks */ u32 i_lc_size; u32 i_lc_shift; u32 i_lc_mask; struct affs_ext_key *i_ac; /* associative cache of extended blocks */ u32 i_ext_last; /* last accessed extended block */ struct buffer_head *i_ext_bh; /* bh of last extended block */ loff_t mmu_private; u32 i_protect; /* unused attribute bits */ u32 i_lastalloc; /* last allocated block */ int i_pa_cnt; /* number of preallocated blocks */ struct inode vfs_inode; }; /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { return container_of(inode, struct affs_inode_info, vfs_inode); } /* * super-block data in memory * * Block numbers are adjusted for their actual size * */ struct affs_bm_info { u32 bm_key; /* Disk block number */ u32 bm_free; /* Free blocks in here */ }; struct affs_sb_info { int s_partition_size; /* Partition size in blocks. */ int s_reserved; /* Number of reserved blocks. */ //u32 s_blksize; /* Initial device blksize */ u32 s_data_blksize; /* size of the data block w/o header */ u32 s_root_block; /* FFS root block number. */ int s_hashsize; /* Size of hash table. */ unsigned long s_flags; /* See below. */ kuid_t s_uid; /* uid to override */ kgid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ struct mutex s_bmlock; /* Protects bitmap access. */ struct affs_bm_info *s_bitmap; /* Bitmap infos. */ u32 s_bmap_count; /* # of bitmap blocks. */ u32 s_bmap_bits; /* # of bits in one bitmap blocks */ u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ spinlock_t symlink_lock; /* protects the previous two */ struct super_block *sb; /* the VFS superblock object */ int work_queued; /* non-zero delayed work is queued */ struct delayed_work sb_work; /* superblock flush delayed work */ spinlock_t work_lock; /* protects sb_work and work_queued */ struct rcu_head rcu; }; #define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ #define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. 
*/ #define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ #define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */ #define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */ #define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */ #define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ #define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ #define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */ #define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */ #define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ #define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt) #define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt) #define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt) /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) { return sb->s_fs_info; } void affs_mark_sb_dirty(struct super_block *sb); /* amigaffs.c */ extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void affs_secs_to_datestamp(time64_t secs, struct affs_date *ds); extern umode_t affs_prot_to_mode(u32 prot); extern void affs_mode_to_prot(struct inode *inode); __printf(3, 4) extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); __printf(3, 4) extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); extern bool affs_nofilenametruncate(const struct dentry *dentry); extern int affs_check_name(const unsigned char *name, int len, bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. 
c */ extern u32 affs_count_free_blocks(struct super_block *s); extern void affs_free_block(struct super_block *sb, u32 block); extern u32 affs_alloc_block(struct inode *inode, u32 goal); extern int affs_init_bitmap(struct super_block *sb, int *flags); extern void affs_free_bitmap(struct super_block *sb); /* namei.c */ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); extern int affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); extern int affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); extern int affs_write_inode(struct inode *inode, struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); int affs_file_fsync(struct file *, loff_t, loff_t, int); /* dir.c */ extern void affs_dir_truncate(struct inode *); /* jump tables */ extern const struct inode_operations affs_file_inode_operations; extern const struct inode_operations affs_dir_inode_operations; extern const struct inode_operations affs_symlink_inode_operations; extern const struct file_operations affs_file_operations; extern const struct file_operations affs_file_operations_ofs; extern const struct file_operations affs_dir_operations; extern const struct address_space_operations affs_symlink_aops; extern const struct address_space_operations affs_aops; extern const struct address_space_operations affs_aops_ofs; extern const struct dentry_operations affs_dentry_operations; extern const struct dentry_operations affs_intl_dentry_operations; static inline bool affs_validblock(struct super_block *sb, int block) { return(block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size); } static inline void affs_set_blocksize(struct super_block *sb, int size) { sb_set_blocksize(sb, size); } static inline struct buffer_head * affs_bread(struct super_block *sb, int block) { pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) return sb_bread(sb, block); return NULL; } static inline struct buffer_head * affs_getblk(struct super_block *sb, int block) { pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) return sb_getblk(sb, block); return NULL; } static inline struct buffer_head * affs_getzeroblk(struct super_block *sb, int block) { struct buffer_head *bh; pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) { bh = sb_getblk(sb, block); 
lock_buffer(bh); memset(bh->b_data, 0 , sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); return bh; } return NULL; } static inline struct buffer_head * affs_getemptyblk(struct super_block *sb, int block) { struct buffer_head *bh; pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) { bh = sb_getblk(sb, block); wait_on_buffer(bh); set_buffer_uptodate(bh); return bh; } return NULL; } static inline void affs_brelse(struct buffer_head *bh) { if (bh) pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr); brelse(bh); } static inline void affs_adjust_checksum(struct buffer_head *bh, u32 val) { u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]); ((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val); } static inline void affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) { u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]); ((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val); } static inline void affs_lock_link(struct inode *inode) { mutex_lock(&AFFS_I(inode)->i_link_lock); } static inline void affs_unlock_link(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_link_lock); } static inline void affs_lock_dir(struct inode *inode) { mutex_lock_nested(&AFFS_I(inode)->i_hash_lock, SINGLE_DEPTH_NESTING); } static inline void affs_unlock_dir(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_hash_lock); } static inline void affs_lock_ext(struct inode *inode) { mutex_lock(&AFFS_I(inode)->i_ext_lock); } static inline void affs_unlock_ext(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_ext_lock); } |
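/*
 * Illustrative userspace sketch (not filesystem code): shows what
 * affs_adjust_checksum() above does, i.e. treat word 5 of the block header
 * as a big-endian u32 checksum and subtract a delta from it. The byte-order
 * helpers are local replacements for be32_to_cpu()/cpu_to_be32(), and the
 * sample block contents are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t load_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static void store_be32(unsigned char *p, uint32_t v)
{
	p[0] = v >> 24;
	p[1] = v >> 16;
	p[2] = v >> 8;
	p[3] = v;
}

/* Equivalent of adjusting ((__be32 *)bh->b_data)[5] by -val. */
static void adjust_checksum(unsigned char *block, uint32_t val)
{
	uint32_t sum = load_be32(block + 5 * 4);

	store_be32(block + 5 * 4, sum - val);
}

int main(void)
{
	unsigned char block[512] = { 0 };

	store_be32(block + 5 * 4, 0x1000);	/* pretend existing checksum */
	adjust_checksum(block, 0x10);
	printf("new checksum: %#x\n", load_be32(block + 5 * 4));
	return 0;
}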
// SPDX-License-Identifier: GPL-2.0-or-later /* * 842 Software Compression * * Copyright (C) 2015 Dan Streetman, IBM Corp * * See 842.h for details of the 842 compressed format. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "842_compress" #include <linux/hashtable.h> #include "842.h" #include "842_debugfs.h" #define SW842_HASHTABLE8_BITS (10) #define SW842_HASHTABLE4_BITS (11) #define SW842_HASHTABLE2_BITS (10) /* By default, we allow compressing input buffers of any length, but we must * use the non-standard "short data" template so the decompressor can correctly * reproduce the uncompressed data buffer at the right length. 
However the * hardware 842 compressor will not recognize the "short data" template, and * will fail to decompress any compressed buffer containing it (I have no idea * why anyone would want to use software to compress and hardware to decompress * but that's beside the point). This parameter forces the compression * function to simply reject any input buffer that isn't a multiple of 8 bytes * long, instead of using the "short data" template, so that all compressed * buffers produced by this function will be decompressable by the 842 hardware * decompressor. Unless you have a specific need for that, leave this disabled * so that any length buffer can be compressed. */ static bool sw842_strict; module_param_named(strict, sw842_strict, bool, 0644); static u8 comp_ops[OPS_MAX][5] = { /* params size in bits */ { I8, N0, N0, N0, 0x19 }, /* 8 */ { I4, I4, N0, N0, 0x18 }, /* 18 */ { I4, I2, I2, N0, 0x17 }, /* 25 */ { I2, I2, I4, N0, 0x13 }, /* 25 */ { I2, I2, I2, I2, 0x12 }, /* 32 */ { I4, I2, D2, N0, 0x16 }, /* 33 */ { I4, D2, I2, N0, 0x15 }, /* 33 */ { I2, D2, I4, N0, 0x0e }, /* 33 */ { D2, I2, I4, N0, 0x09 }, /* 33 */ { I2, I2, I2, D2, 0x11 }, /* 40 */ { I2, I2, D2, I2, 0x10 }, /* 40 */ { I2, D2, I2, I2, 0x0d }, /* 40 */ { D2, I2, I2, I2, 0x08 }, /* 40 */ { I4, D4, N0, N0, 0x14 }, /* 41 */ { D4, I4, N0, N0, 0x04 }, /* 41 */ { I2, I2, D4, N0, 0x0f }, /* 48 */ { I2, D2, I2, D2, 0x0c }, /* 48 */ { I2, D4, I2, N0, 0x0b }, /* 48 */ { D2, I2, I2, D2, 0x07 }, /* 48 */ { D2, I2, D2, I2, 0x06 }, /* 48 */ { D4, I2, I2, N0, 0x03 }, /* 48 */ { I2, D2, D4, N0, 0x0a }, /* 56 */ { D2, I2, D4, N0, 0x05 }, /* 56 */ { D4, I2, D2, N0, 0x02 }, /* 56 */ { D4, D2, I2, N0, 0x01 }, /* 56 */ { D8, N0, N0, N0, 0x00 }, /* 64 */ }; struct sw842_hlist_node8 { struct hlist_node node; u64 data; u8 index; }; struct sw842_hlist_node4 { struct hlist_node node; u32 data; u16 index; }; struct sw842_hlist_node2 { struct hlist_node node; u16 data; u8 index; }; #define INDEX_NOT_FOUND (-1) #define INDEX_NOT_CHECKED (-2) struct sw842_param { u8 *in; u8 *instart; u64 ilen; u8 *out; u64 olen; u8 bit; u64 data8[1]; u32 data4[2]; u16 data2[4]; int index8[1]; int index4[2]; int index2[4]; DECLARE_HASHTABLE(htable8, SW842_HASHTABLE8_BITS); DECLARE_HASHTABLE(htable4, SW842_HASHTABLE4_BITS); DECLARE_HASHTABLE(htable2, SW842_HASHTABLE2_BITS); struct sw842_hlist_node8 node8[1 << I8_BITS]; struct sw842_hlist_node4 node4[1 << I4_BITS]; struct sw842_hlist_node2 node2[1 << I2_BITS]; }; #define get_input_data(p, o, b) \ be##b##_to_cpu(get_unaligned((__be##b *)((p)->in + (o)))) #define init_hashtable_nodes(p, b) do { \ int _i; \ hash_init((p)->htable##b); \ for (_i = 0; _i < ARRAY_SIZE((p)->node##b); _i++) { \ (p)->node##b[_i].index = _i; \ (p)->node##b[_i].data = 0; \ INIT_HLIST_NODE(&(p)->node##b[_i].node); \ } \ } while (0) #define find_index(p, b, n) ({ \ struct sw842_hlist_node##b *_n; \ p->index##b[n] = INDEX_NOT_FOUND; \ hash_for_each_possible(p->htable##b, _n, node, p->data##b[n]) { \ if (p->data##b[n] == _n->data) { \ p->index##b[n] = _n->index; \ break; \ } \ } \ p->index##b[n] >= 0; \ }) #define check_index(p, b, n) \ ((p)->index##b[n] == INDEX_NOT_CHECKED \ ? 
find_index(p, b, n) \ : (p)->index##b[n] >= 0) #define replace_hash(p, b, i, d) do { \ struct sw842_hlist_node##b *_n = &(p)->node##b[(i)+(d)]; \ hash_del(&_n->node); \ _n->data = (p)->data##b[d]; \ pr_debug("add hash index%x %x pos %x data %lx\n", b, \ (unsigned int)_n->index, \ (unsigned int)((p)->in - (p)->instart), \ (unsigned long)_n->data); \ hash_add((p)->htable##b, &_n->node, _n->data); \ } while (0) static u8 bmask[8] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe }; static int add_bits(struct sw842_param *p, u64 d, u8 n); static int __split_add_bits(struct sw842_param *p, u64 d, u8 n, u8 s) { int ret; if (n <= s) return -EINVAL; ret = add_bits(p, d >> s, n - s); if (ret) return ret; return add_bits(p, d & GENMASK_ULL(s - 1, 0), s); } static int add_bits(struct sw842_param *p, u64 d, u8 n) { int b = p->bit, bits = b + n, s = round_up(bits, 8) - bits; u64 o; u8 *out = p->out; pr_debug("add %u bits %lx\n", (unsigned char)n, (unsigned long)d); if (n > 64) return -EINVAL; /* split this up if writing to > 8 bytes (i.e. n == 64 && p->bit > 0), * or if we're at the end of the output buffer and would write past end */ if (bits > 64) return __split_add_bits(p, d, n, 32); else if (p->olen < 8 && bits > 32 && bits <= 56) return __split_add_bits(p, d, n, 16); else if (p->olen < 4 && bits > 16 && bits <= 24) return __split_add_bits(p, d, n, 8); if (DIV_ROUND_UP(bits, 8) > p->olen) return -ENOSPC; o = *out & bmask[b]; d <<= s; if (bits <= 8) *out = o | d; else if (bits <= 16) put_unaligned(cpu_to_be16(o << 8 | d), (__be16 *)out); else if (bits <= 24) put_unaligned(cpu_to_be32(o << 24 | d << 8), (__be32 *)out); else if (bits <= 32) put_unaligned(cpu_to_be32(o << 24 | d), (__be32 *)out); else if (bits <= 40) put_unaligned(cpu_to_be64(o << 56 | d << 24), (__be64 *)out); else if (bits <= 48) put_unaligned(cpu_to_be64(o << 56 | d << 16), (__be64 *)out); else if (bits <= 56) put_unaligned(cpu_to_be64(o << 56 | d << 8), (__be64 *)out); else put_unaligned(cpu_to_be64(o << 56 | d), (__be64 *)out); p->bit += n; if (p->bit > 7) { p->out += p->bit / 8; p->olen -= p->bit / 8; p->bit %= 8; } return 0; } static int add_template(struct sw842_param *p, u8 c) { int ret, i, b = 0; u8 *t = comp_ops[c]; bool inv = false; if (c >= OPS_MAX) return -EINVAL; pr_debug("template %x\n", t[4]); ret = add_bits(p, t[4], OP_BITS); if (ret) return ret; for (i = 0; i < 4; i++) { pr_debug("op %x\n", t[i]); switch (t[i] & OP_AMOUNT) { case OP_AMOUNT_8: if (b) inv = true; else if (t[i] & OP_ACTION_INDEX) ret = add_bits(p, p->index8[0], I8_BITS); else if (t[i] & OP_ACTION_DATA) ret = add_bits(p, p->data8[0], 64); else inv = true; break; case OP_AMOUNT_4: if (b == 2 && t[i] & OP_ACTION_DATA) ret = add_bits(p, get_input_data(p, 2, 32), 32); else if (b != 0 && b != 4) inv = true; else if (t[i] & OP_ACTION_INDEX) ret = add_bits(p, p->index4[b >> 2], I4_BITS); else if (t[i] & OP_ACTION_DATA) ret = add_bits(p, p->data4[b >> 2], 32); else inv = true; break; case OP_AMOUNT_2: if (b != 0 && b != 2 && b != 4 && b != 6) inv = true; if (t[i] & OP_ACTION_INDEX) ret = add_bits(p, p->index2[b >> 1], I2_BITS); else if (t[i] & OP_ACTION_DATA) ret = add_bits(p, p->data2[b >> 1], 16); else inv = true; break; case OP_AMOUNT_0: inv = (b != 8) || !(t[i] & OP_ACTION_NOOP); break; default: inv = true; break; } if (ret) return ret; if (inv) { pr_err("Invalid templ %x op %d : %x %x %x %x\n", c, i, t[0], t[1], t[2], t[3]); return -EINVAL; } b += t[i] & OP_AMOUNT; } if (b != 8) { pr_err("Invalid template %x len %x : %x %x %x %x\n", c, b, t[0], t[1], t[2], 
t[3]); return -EINVAL; } if (sw842_template_counts) atomic_inc(&template_count[t[4]]); return 0; } static int add_repeat_template(struct sw842_param *p, u8 r) { int ret; /* repeat param is 0-based */ if (!r || --r > REPEAT_BITS_MAX) return -EINVAL; ret = add_bits(p, OP_REPEAT, OP_BITS); if (ret) return ret; ret = add_bits(p, r, REPEAT_BITS); if (ret) return ret; if (sw842_template_counts) atomic_inc(&template_repeat_count); return 0; } static int add_short_data_template(struct sw842_param *p, u8 b) { int ret, i; if (!b || b > SHORT_DATA_BITS_MAX) return -EINVAL; ret = add_bits(p, OP_SHORT_DATA, OP_BITS); if (ret) return ret; ret = add_bits(p, b, SHORT_DATA_BITS); if (ret) return ret; for (i = 0; i < b; i++) { ret = add_bits(p, p->in[i], 8); if (ret) return ret; } if (sw842_template_counts) atomic_inc(&template_short_data_count); return 0; } static int add_zeros_template(struct sw842_param *p) { int ret = add_bits(p, OP_ZEROS, OP_BITS); if (ret) return ret; if (sw842_template_counts) atomic_inc(&template_zeros_count); return 0; } static int add_end_template(struct sw842_param *p) { int ret = add_bits(p, OP_END, OP_BITS); if (ret) return ret; if (sw842_template_counts) atomic_inc(&template_end_count); return 0; } static bool check_template(struct sw842_param *p, u8 c) { u8 *t = comp_ops[c]; int i, match, b = 0; if (c >= OPS_MAX) return false; for (i = 0; i < 4; i++) { if (t[i] & OP_ACTION_INDEX) { if (t[i] & OP_AMOUNT_2) match = check_index(p, 2, b >> 1); else if (t[i] & OP_AMOUNT_4) match = check_index(p, 4, b >> 2); else if (t[i] & OP_AMOUNT_8) match = check_index(p, 8, 0); else return false; if (!match) return false; } b += t[i] & OP_AMOUNT; } return true; } static void get_next_data(struct sw842_param *p) { p->data8[0] = get_input_data(p, 0, 64); p->data4[0] = get_input_data(p, 0, 32); p->data4[1] = get_input_data(p, 4, 32); p->data2[0] = get_input_data(p, 0, 16); p->data2[1] = get_input_data(p, 2, 16); p->data2[2] = get_input_data(p, 4, 16); p->data2[3] = get_input_data(p, 6, 16); } /* update the hashtable entries. * only call this after finding/adding the current template * the dataN fields for the current 8 byte block must be already updated */ static void update_hashtables(struct sw842_param *p) { u64 pos = p->in - p->instart; u64 n8 = (pos >> 3) % (1 << I8_BITS); u64 n4 = (pos >> 2) % (1 << I4_BITS); u64 n2 = (pos >> 1) % (1 << I2_BITS); replace_hash(p, 8, n8, 0); replace_hash(p, 4, n4, 0); replace_hash(p, 4, n4, 1); replace_hash(p, 2, n2, 0); replace_hash(p, 2, n2, 1); replace_hash(p, 2, n2, 2); replace_hash(p, 2, n2, 3); } /* find the next template to use, and add it * the p->dataN fields must already be set for the current 8 byte block */ static int process_next(struct sw842_param *p) { int ret, i; p->index8[0] = INDEX_NOT_CHECKED; p->index4[0] = INDEX_NOT_CHECKED; p->index4[1] = INDEX_NOT_CHECKED; p->index2[0] = INDEX_NOT_CHECKED; p->index2[1] = INDEX_NOT_CHECKED; p->index2[2] = INDEX_NOT_CHECKED; p->index2[3] = INDEX_NOT_CHECKED; /* check up to OPS_MAX - 1; last op is our fallback */ for (i = 0; i < OPS_MAX - 1; i++) { if (check_template(p, i)) break; } ret = add_template(p, i); if (ret) return ret; return 0; } /** * sw842_compress * * Compress the uncompressed buffer of length @ilen at @in to the output buffer * @out, using no more than @olen bytes, using the 842 compression format. * * Returns: 0 on success, error on failure. The @olen parameter * will contain the number of output bytes written on success, or * 0 on error. 
*/ int sw842_compress(const u8 *in, unsigned int ilen, u8 *out, unsigned int *olen, void *wmem) { struct sw842_param *p = (struct sw842_param *)wmem; int ret; u64 last, next, pad, total; u8 repeat_count = 0; u32 crc; BUILD_BUG_ON(sizeof(*p) > SW842_MEM_COMPRESS); init_hashtable_nodes(p, 8); init_hashtable_nodes(p, 4); init_hashtable_nodes(p, 2); p->in = (u8 *)in; p->instart = p->in; p->ilen = ilen; p->out = out; p->olen = *olen; p->bit = 0; total = p->olen; *olen = 0; /* if using strict mode, we can only compress a multiple of 8 */ if (sw842_strict && (ilen % 8)) { pr_err("Using strict mode, can't compress len %d\n", ilen); return -EINVAL; } /* let's compress at least 8 bytes, mkay? */ if (unlikely(ilen < 8)) goto skip_comp; /* make initial 'last' different so we don't match the first time */ last = ~get_unaligned((u64 *)p->in); while (p->ilen > 7) { next = get_unaligned((u64 *)p->in); /* must get the next data, as we need to update the hashtable * entries with the new data every time */ get_next_data(p); /* we don't care about endianness in last or next; * we're just comparing 8 bytes to another 8 bytes, * they're both the same endianness */ if (next == last) { /* repeat count bits are 0-based, so we stop at +1 */ if (++repeat_count <= REPEAT_BITS_MAX) goto repeat; } if (repeat_count) { ret = add_repeat_template(p, repeat_count); if (ret) return ret; repeat_count = 0; if (next == last) /* reached max repeat bits */ goto repeat; } if (next == 0) ret = add_zeros_template(p); else ret = process_next(p); if (ret) return ret; repeat: last = next; update_hashtables(p); p->in += 8; p->ilen -= 8; } if (repeat_count) { ret = add_repeat_template(p, repeat_count); if (ret) return ret; } skip_comp: if (p->ilen > 0) { ret = add_short_data_template(p, p->ilen); if (ret) return ret; p->in += p->ilen; p->ilen = 0; } ret = add_end_template(p); if (ret) return ret; /* * crc(0:31) is appended to target data starting with the next * bit after End of stream template. * nx842 calculates CRC for data in big-endian format. So doing * same here so that sw842 decompression can be used for both * compressed data. */ crc = crc32_be(0, in, ilen); ret = add_bits(p, crc, CRC_BITS); if (ret) return ret; if (p->bit) { p->out++; p->olen--; p->bit = 0; } /* pad compressed length to multiple of 8 */ pad = (8 - ((total - p->olen) % 8)) % 8; if (pad) { if (pad > p->olen) /* we were so close! */ return -ENOSPC; memset(p->out, 0, pad); p->out += pad; p->olen -= pad; } if (unlikely((total - p->olen) > UINT_MAX)) return -ENOSPC; *olen = total - p->olen; return 0; } EXPORT_SYMBOL_GPL(sw842_compress); static int __init sw842_init(void) { if (sw842_template_counts) sw842_debugfs_create(); return 0; } module_init(sw842_init); static void __exit sw842_exit(void) { if (sw842_template_counts) sw842_debugfs_remove(); } module_exit(sw842_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Software 842 Compressor"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); |
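/*
 * Illustrative userspace sketch (not the kernel implementation): a
 * deliberately simple MSB-first bit writer showing the kind of packing
 * add_bits() above performs when it emits template codes and index/data
 * fields into the compressed stream. add_bits() does this with unaligned
 * multi-byte stores; here each bit is written individually for clarity, and
 * the sample field widths/values are chosen only for the demonstration.
 */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct bitwriter {
	uint8_t *buf;
	size_t len;		/* buffer size in bytes */
	size_t bitpos;		/* next free bit, counted from the start */
};

/* Append the low n bits of d, most significant bit first; -1 on overflow. */
static int put_bits(struct bitwriter *w, uint64_t d, unsigned int n)
{
	if (n > 64 || w->bitpos + n > w->len * 8)
		return -1;
	while (n--) {
		if ((d >> n) & 1)
			w->buf[w->bitpos / 8] |= 0x80u >> (w->bitpos % 8);
		w->bitpos++;
	}
	return 0;
}

int main(void)
{
	uint8_t out[8] = { 0 };
	struct bitwriter w = { .buf = out, .len = sizeof(out), .bitpos = 0 };

	/* e.g. a 5-bit template code (0x19 appears in comp_ops above)
	 * followed by an 8-bit index-style field. */
	put_bits(&w, 0x19, 5);
	put_bits(&w, 0xab, 8);
	printf("packed bytes: %02x %02x\n", out[0], out[1]);
	return 0;
}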
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. 
http://covalent.io */ #include <linux/skmsg.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> #include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { if (msg->sg.end > msg->sg.start && elem_first_coalesce < msg->sg.end) return true; if (msg->sg.end < msg->sg.start && (elem_first_coalesce > msg->sg.start || elem_first_coalesce < msg->sg.end)) return true; return false; } int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { struct page_frag *pfrag = sk_page_frag(sk); u32 osize = msg->sg.size; int ret = 0; len -= msg->sg.size; while (len > 0) { struct scatterlist *sge; u32 orig_offset; int use, i; if (!sk_page_frag_refill(sk, pfrag)) { ret = -ENOMEM; goto msg_trim; } orig_offset = pfrag->offset; use = min_t(int, len, pfrag->size - orig_offset); if (!sk_wmem_schedule(sk, use)) { ret = -ENOMEM; goto msg_trim; } i = msg->sg.end; sk_msg_iter_var_prev(i); sge = &msg->sg.data[i]; if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && sg_page(sge) == pfrag->page && sge->offset + sge->length == orig_offset) { sge->length += use; } else { if (sk_msg_full(msg)) { ret = -ENOSPC; break; } sge = &msg->sg.data[msg->sg.end]; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); get_page(pfrag->page); sk_msg_iter_next(msg, end); } sk_mem_charge(sk, use); msg->sg.size += use; pfrag->offset += use; len -= use; } return ret; msg_trim: sk_msg_trim(sk, msg, osize); return ret; } EXPORT_SYMBOL_GPL(sk_msg_alloc); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len) { int i = src->sg.start; struct scatterlist *sge = sk_msg_elem(src, i); struct scatterlist *sgd = NULL; u32 sge_len, sge_off; while (off) { if (sge->length > off) break; off -= sge->length; sk_msg_iter_var_next(i); if (i == src->sg.end && off) return -ENOSPC; sge = sk_msg_elem(src, i); } while (len) { sge_len = sge->length - off; if (sge_len > len) sge_len = len; if (dst->sg.end) sgd = sk_msg_elem(dst, dst->sg.end - 1); if (sgd && (sg_page(sge) == sg_page(sgd)) && (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { sgd->length += sge_len; dst->sg.size += sge_len; } else if (!sk_msg_full(dst)) { sge_off = sge->offset + off; sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); } else { return -ENOSPC; } off = 0; len -= sge_len; sk_mem_charge(sk, sge_len); sk_msg_iter_var_next(i); if (i == src->sg.end && len) return -ENOSPC; sge = sk_msg_elem(src, i); } return 0; } EXPORT_SYMBOL_GPL(sk_msg_clone); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; do { struct scatterlist *sge = sk_msg_elem(msg, i); if (bytes < sge->length) { sge->length -= bytes; sge->offset += bytes; sk_mem_uncharge(sk, bytes); break; } sk_mem_uncharge(sk, sge->length); bytes -= sge->length; sge->length = 0; sge->offset = 0; sk_msg_iter_var_next(i); } while (bytes && i != msg->sg.end); msg->sg.start = i; } EXPORT_SYMBOL_GPL(sk_msg_return_zero); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; do { struct scatterlist *sge = &msg->sg.data[i]; int uncharge = (bytes < sge->length) ? 
bytes : sge->length; sk_mem_uncharge(sk, uncharge); bytes -= uncharge; sk_msg_iter_var_next(i); } while (i != msg->sg.end); } EXPORT_SYMBOL_GPL(sk_msg_return); static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, bool charge) { struct scatterlist *sge = sk_msg_elem(msg, i); u32 len = sge->length; /* When the skb owns the memory we free it from consume_skb path. */ if (!msg->skb) { if (charge) sk_mem_uncharge(sk, len); put_page(sg_page(sge)); } memset(sge, 0, sizeof(*sge)); return len; } static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, bool charge) { struct scatterlist *sge = sk_msg_elem(msg, i); int freed = 0; while (msg->sg.size) { msg->sg.size -= sge->length; freed += sk_msg_free_elem(sk, msg, i, charge); sk_msg_iter_var_next(i); sk_msg_check_to_free(msg, i, msg->sg.size); sge = sk_msg_elem(msg, i); } consume_skb(msg->skb); sk_msg_init(msg); return freed; } int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) { return __sk_msg_free(sk, msg, msg->sg.start, false); } EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); int sk_msg_free(struct sock *sk, struct sk_msg *msg) { return __sk_msg_free(sk, msg, msg->sg.start, true); } EXPORT_SYMBOL_GPL(sk_msg_free); static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes, bool charge) { struct scatterlist *sge; u32 i = msg->sg.start; while (bytes) { sge = sk_msg_elem(msg, i); if (!sge->length) break; if (bytes < sge->length) { if (charge) sk_mem_uncharge(sk, bytes); sge->length -= bytes; sge->offset += bytes; msg->sg.size -= bytes; break; } msg->sg.size -= sge->length; bytes -= sge->length; sk_msg_free_elem(sk, msg, i, charge); sk_msg_iter_var_next(i); sk_msg_check_to_free(msg, i, bytes); } msg->sg.start = i; } void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) { __sk_msg_free_partial(sk, msg, bytes, true); } EXPORT_SYMBOL_GPL(sk_msg_free_partial); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes) { __sk_msg_free_partial(sk, msg, bytes, false); } void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) { int trim = msg->sg.size - len; u32 i = msg->sg.end; if (trim <= 0) { WARN_ON(trim < 0); return; } sk_msg_iter_var_prev(i); msg->sg.size = len; while (msg->sg.data[i].length && trim >= msg->sg.data[i].length) { trim -= msg->sg.data[i].length; sk_msg_free_elem(sk, msg, i, true); sk_msg_iter_var_prev(i); if (!trim) goto out; } msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); /* Adjust copybreak if it falls into the trimmed part of last buf */ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) msg->sg.copybreak = msg->sg.data[i].length; out: sk_msg_iter_var_next(i); msg->sg.end = i; /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. * However trimmed data that has not yet been used in a copy op * does not require an update. 
*/ if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } EXPORT_SYMBOL_GPL(sk_msg_trim); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); const int to_max_pages = MAX_MSG_FRAGS; struct page *pages[MAX_MSG_FRAGS]; ssize_t orig, copied, use, offset; orig = msg->sg.size; while (bytes > 0) { i = 0; maxpages = to_max_pages - num_elems; if (maxpages == 0) { ret = -EFAULT; goto out; } copied = iov_iter_get_pages2(from, pages, bytes, maxpages, &offset); if (copied <= 0) { ret = -EFAULT; goto out; } bytes -= copied; msg->sg.size += copied; while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); sg_set_page(&msg->sg.data[msg->sg.end], pages[i], use, offset); sg_unmark_end(&msg->sg.data[msg->sg.end]); sk_mem_charge(sk, use); offset = 0; copied -= use; sk_msg_iter_next(msg, end); num_elems++; i++; } /* When zerocopy is mixed with sk_msg_*copy* operations we * may have a copybreak set in this case clear and prefer * zerocopy remainder when possible. */ msg->sg.copybreak = 0; msg->sg.curr = msg->sg.end; } out: /* Revert iov_iter updates, msg will need to use 'trim' later if it * also needs to be cleared. */ if (ret) iov_iter_revert(from, msg->sg.size - orig); return ret; } EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; u32 copy, buf_size, copied = 0; struct scatterlist *sge; void *to; do { sge = sk_msg_elem(msg, i); /* This is possible if a trim operation shrunk the buffer */ if (msg->sg.copybreak >= sge->length) { msg->sg.copybreak = 0; sk_msg_iter_var_next(i); if (i == msg->sg.end) break; sge = sk_msg_elem(msg, i); } buf_size = sge->length - msg->sg.copybreak; copy = (buf_size > bytes) ? bytes : buf_size; to = sg_virt(sge) + msg->sg.copybreak; msg->sg.copybreak += copy; if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) ret = copy_from_iter_nocache(to, copy, from); else ret = copy_from_iter(to, copy, from); if (ret != copy) { ret = -EFAULT; goto out; } bytes -= copy; copied += copy; if (!bytes) break; msg->sg.copybreak = 0; sk_msg_iter_var_next(i); } while (i != msg->sg.end); out: msg->sg.curr = i; return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; if (unlikely(!msg_rx)) break; i = msg_rx->sg.start; do { struct page *page; int copy; sge = sk_msg_elem(msg_rx, i); copy = sge->length; page = sg_page(sge); if (copied + copy > len) copy = len - copied; if (copy) copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? 
copied : -EFAULT; goto out; } copied += copy; if (likely(!peek)) { sge->offset += copy; sge->length -= copy; if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); atomic_sub(copy, &sk->sk_rmem_alloc); } msg_rx->sg.size -= copy; if (!sge->length) { sk_msg_iter_var_next(i); if (!msg_rx->skb) put_page(page); } } else { /* Lets not optimize peek case if copy_page_to_iter * didn't copy the entire length lets just break. */ if (copy != sge->length) goto out; sk_msg_iter_var_next(i); } if (copied == len) break; } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); if (!msg_rx) break; continue; } msg_rx->sg.start = i; if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } msg_rx = sk_psock_peek_msg(psock); } out: return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); bool sk_msg_is_readable(struct sock *sk) { struct sk_psock *psock; bool empty = true; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) empty = list_empty(&psock->ingress_msg); rcu_read_unlock(); return !empty; } EXPORT_SYMBOL_GPL(sk_msg_is_readable); static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); return msg; } static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return NULL; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, struct sk_msg *msg, bool take_ref) { int num_sge, copied; /* skb_to_sgvec will fail when the total number of fragments in * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the * caller may aggregate multiple skbs. */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (unlikely(num_sge < 0)) return num_sge; } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) psock->ingress_bytes += len; #endif copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); return copied; } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { struct sock *sk = psock->sk; struct sk_msg *msg; int err; /* If we are receiving on the same sock skb->sk is already assigned, * skip memory accounting and owner transition seeing it already set * correctly. */ if (unlikely(skb->sk == sk)) return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; /* This will transition ownership of the data from the socket where * the BPF program was run initiating the redirect to the socket * we will eventually receive this data on. 
The data will be released * from skb_consume found in __tcp_bpf_recvmsg() after its been copied * into user buffers. */ skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; } /* Puts an skb on the ingress queue of the socket already assigned to the * skb. In this case we do not need to check memory limits or skb_set_owner_r * because the skb is already accounted for here. */ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; } static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { state->len = len; state->off = off; } spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_backlog(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; u32 len = 0, off = 0; bool ingress; int ret; /* If sk is quickly removed from the map and then added back, the old * psock should not be scheduled, because there are now two psocks * pointing to the same sk. */ if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) return; /* Increment the psock refcnt to synchronize with close(fd) path in * sock_map_close(), ensuring we wait for backlog thread completion * before sk_socket freed. If refcnt increment fails, it indicates * sock_map_close() completed with sk_socket potentially already freed. */ if (!sk_psock_get(psock->sk)) return; mutex_lock(&psock->work_mutex); while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { struct strp_msg *stm = strp_msg(skb); off = stm->offset; len = stm->full_len; } /* Resume processing from previous partial state */ if (unlikely(state->len)) { len = state->len; off = state->off; } ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { ret = -EIO; if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); /* Restore redir info we cleared before */ skb_bpf_set_redir(skb, psock->sk, ingress); /* Delay slightly to prioritize any * other work that might be here. */ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? 
-ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); goto end; } off += ret; len -= ret; } while (len); /* The entire skb sent, clear state */ sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) { struct sk_psock *psock; struct proto *prot; write_lock_bh(&sk->sk_callback_lock); if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { psock = ERR_PTR(-EINVAL); goto out; } if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; } psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); if (!psock) { psock = ERR_PTR(-ENOMEM); goto out; } prot = READ_ONCE(sk->sk_prot); psock->sk = sk; psock->eval = __SK_NONE; psock->sk_proto = prot; psock->saved_unhash = prot->unhash; psock->saved_destroy = prot->destroy; psock->saved_close = prot->close; psock->saved_write_space = sk->sk_write_space; INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); __rcu_assign_sk_user_data_with_flags(sk, psock, SK_USER_DATA_NOCOPY | SK_USER_DATA_PSOCK); sock_hold(sk); out: write_unlock_bh(&sk->sk_callback_lock); return psock; } EXPORT_SYMBOL_GPL(sk_psock_init); struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) { struct sk_psock_link *link; spin_lock_bh(&psock->link_lock); link = list_first_entry_or_null(&psock->link, struct sk_psock_link, list); if (link) list_del(&link->list); spin_unlock_bh(&psock->link_lock); return link; } static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); if (!msg->skb) atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } } static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } __sk_psock_purge_ingress_msg(psock); } static void sk_psock_link_destroy(struct sk_psock *psock) { struct sk_psock_link *link, *tmp; list_for_each_entry_safe(link, tmp, &psock->link, list) { list_del(&link->list); sk_psock_free_link(link); } } void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy(struct work_struct *work) { struct sk_psock *psock = container_of(to_rcu_work(work), struct sk_psock, rwork); /* No sk_callback_lock since already detached. 
*/ sk_psock_done_strp(psock); cancel_delayed_work_sync(&psock->work); __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); if (psock->sk_redir) sock_put(psock->sk_redir); if (psock->sk_pair) sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); static int sk_psock_map_verd(int verdict, bool redir) { switch (verdict) { case SK_PASS: return redir ? __SK_REDIRECT : __SK_PASS; case SK_DROP: default: break; } return __SK_DROP; } int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg) { struct bpf_prog *prog; int ret; rcu_read_lock(); prog = READ_ONCE(psock->progs.msg_parser); if (unlikely(!prog)) { ret = __SK_PASS; goto out; } sk_msg_compute_data_pointers(msg); msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } if (!msg->sk_redir) { ret = __SK_DROP; goto out; } psock->redir_ingress = sk_msg_to_ingress(msg); psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. 
*/ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: default: break; } } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog; int ret = __SK_PASS; rcu_read_lock(); prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { struct sock *sk_other; int err = 0; u32 len, off; switch (verdict) { case __SK_PASS: err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) goto out_free; skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to * queue work otherwise we may get OOO data. Otherwise, * if sk_psock_skb_ingress errors will be handled by * retrying later from workqueue. 
*/ if (skb_queue_empty(&psock->ingress_skb)) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { struct strp_msg *stm = strp_msg(skb); off = stm->offset; len = stm->full_len; } err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); if (err < 0) goto out_free; } break; case __SK_REDIRECT: tcp_eat_skb(psock->sk, skb); err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: skb_bpf_redirect_clear(skb); tcp_eat_skb(psock->sk, skb); sock_drop(psock->sk, skb); } return err; } static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); if (write_space) write_space(sk); } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; struct sock *sk; rcu_read_lock(); sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_verdict_apply(psock, skb, ret); out: rcu_read_unlock(); } static int sk_psock_strp_read_done(struct strparser *strp, int err) { return err; } static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; rcu_read_lock(); prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); return ret; } /* Called with socket lock held. 
*/ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; trace_sk_data_ready(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { int ret; static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; ret = strp_init(&psock->strp, sk, &cb); if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); if (sk_is_tcp(sk)) { psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; psock->copied_seq = tcp_sk(sk)->copied_seq; } return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) return; psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { psock_set_prog(&psock->progs.stream_parser, NULL); if (!psock->saved_data_ready) return; sk->sk_data_ready = psock->saved_data_ready; psock->saved_data_ready = NULL; strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) { } #endif /* CONFIG_BPF_STREAM_PARSER */ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; int len = skb->len; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; tcp_eat_skb(sk, skb); sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); if (!prog) prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } ret = sk_psock_verdict_apply(psock, skb, ret); if (ret < 0) len = ret; out: rcu_read_unlock(); return len; } static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); if (unlikely(!sock)) return; ops = READ_ONCE(sock->ops); if (!ops || !ops->read_skb) return; copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) sk_psock_data_ready(sk, psock); rcu_read_unlock(); } } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) return; psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { psock_set_prog(&psock->progs.stream_verdict, NULL); psock_set_prog(&psock->progs.skb_verdict, NULL); if (!psock->saved_data_ready) return; sk->sk_data_ready = psock->saved_data_ready; psock->saved_data_ready = NULL; } |
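The sk_msg layer above stores its scatterlist entries in a small ring, so msg->sg.end can wrap around below msg->sg.start; that is why sk_msg_try_coalesce_ok() tests both the end > start and end < start cases and why distances are taken with sk_msg_iter_dist() rather than plain subtraction. The stand-alone sketch below only illustrates that index arithmetic and is not kernel code: RING_SIZE, iter_next() and iter_dist() are hypothetical stand-ins for NR_MSG_FRAG_IDS, sk_msg_iter_var_next() and sk_msg_iter_dist(), and the ring size of 16 is an arbitrary example value.

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 16	/* hypothetical stand-in for NR_MSG_FRAG_IDS */

/* Advance an index one slot, wrapping at the end of the ring. */
static unsigned int iter_next(unsigned int i)
{
	return (i + 1) % RING_SIZE;
}

/* Number of slots in use between start and end, valid across a wrap. */
static unsigned int iter_dist(unsigned int start, unsigned int end)
{
	return end >= start ? end - start : end + RING_SIZE - start;
}

int main(void)
{
	unsigned int start = 13, end = 13;
	int n;

	for (n = 0; n < 5; n++)		/* queue five elements */
		end = iter_next(end);

	/* end (2) is now numerically below start (13): the wrapped case
	 * that sk_msg_try_coalesce_ok() checks for explicitly. */
	printf("start=%u end=%u used=%u\n", start, end, iter_dist(start, end));
	assert(iter_dist(start, end) == 5);
	return 0;
}

Built with an ordinary C compiler it prints start=13 end=2 used=5, the wrapped layout that the coalesce and trim helpers above have to cope with; the kernel's real helpers live in include/linux/skmsg.h.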
// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. ** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* ******************************************************************************/ /* * lowcomms.c * * This is the "low-level" comms layer. * * It is responsible for sending/receiving messages * from other nodes in the cluster. * * Cluster nodes are referred to by their nodeids. nodeids are * simply 32 bit numbers to the locking module - if they need to * be expanded for the cluster infrastructure then that is its * responsibility. It is this layer's * responsibility to resolve these into IP address or * whatever it needs for inter-node communication. * * The comms level is two kernel threads that deal mainly with * the receiving of messages from other nodes and passing them * up to the mid-level comms layer (which understands the * message format) for execution by the locking core, and * a send thread which does all the setting up of connections * to remote nodes and the sending of data. Threads are not allowed * to send their own data because it may cause them to wait in times * of high load. Also, this way, the sending thread can collect together * messages bound for one node and send them in one block. * * lowcomms will choose to use either TCP or SCTP as its transport layer * depending on the configuration variable 'protocol'. This should be set * to 0 (default) for TCP or 1 for SCTP.
It should be configured using a * cluster-wide mechanism as it must be the same on all nodes of the cluster * for the DLM to function. * */ #include <asm/ioctls.h> #include <net/sock.h> #include <net/tcp.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mutex.h> #include <linux/sctp.h> #include <linux/slab.h> #include <net/sctp/sctp.h> #include <net/ipv6.h> #include <trace/events/dlm.h> #include <trace/events/sock.h> #include "dlm_internal.h" #include "lowcomms.h" #include "midcomms.h" #include "memory.h" #include "config.h" #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) #define DLM_MAX_PROCESS_BUFFERS 24 #define NEEDED_RMEM (4*1024*1024) struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ /* this semaphore is used to allow parallel recv/send in read * lock mode. When we release a sock we need to held the write lock. * * However this is locking code and not nice. When we remove the * othercon handling we can look into other mechanism to synchronize * io handling to call sock_release() at the right time. */ struct rw_semaphore sock_lock; unsigned long flags; #define CF_APP_LIMITED 0 #define CF_RECV_PENDING 1 #define CF_SEND_PENDING 2 #define CF_RECV_INTR 3 #define CF_IO_STOP 4 #define CF_IS_OTHERCON 5 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int retries; struct hlist_node list; /* due some connect()/accept() races we currently have this cross over * connection attempt second connection for one node. * * There is a solution to avoid the race by introducing a connect * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to * connect. Otherside can connect but will only be considered that * the other side wants to have a reconnect. * * However changing to this behaviour will break backwards compatible. * In a DLM protocol major version upgrade we should remove this! 
*/ struct connection *othercon; struct work_struct rwork; /* receive worker */ struct work_struct swork; /* send worker */ wait_queue_head_t shutdown_wait; unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; int addr_count; int curr_addr_index; struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT]; spinlock_t addrs_lock; struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) struct listen_connection { struct socket *sock; struct work_struct rwork; }; #define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - e->end) #define DLM_WQ_LENGTH_BYTES(e) (e->end - e->offset) /* An entry waiting to be sent */ struct writequeue_entry { struct list_head list; struct page *page; int offset; int len; int end; int users; bool dirty; struct connection *con; struct list_head msgs; struct kref ref; }; struct dlm_msg { struct writequeue_entry *entry; struct dlm_msg *orig_msg; bool retransmit; void *ppc; int len; int idx; /* new()/commit() idx exchange */ struct list_head list; struct kref ref; }; struct processqueue_entry { unsigned char *buf; int nodeid; int buflen; struct list_head list; }; struct dlm_proto_ops { bool try_new_addr; const char *name; int proto; int how; void (*sockopts)(struct socket *sock); int (*bind)(struct socket *sock); int (*listen_validate)(void); void (*listen_sockopts)(struct socket *sock); int (*listen_bind)(struct socket *sock); }; static struct listen_sock_callbacks { void (*sk_error_report)(struct sock *); void (*sk_data_ready)(struct sock *); void (*sk_state_change)(struct sock *); void (*sk_write_space)(struct sock *); } listen_sock; static struct listen_connection listen_con; static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; /* Work queues */ static struct workqueue_struct *io_workqueue; static struct workqueue_struct *process_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); DEFINE_STATIC_SRCU(connections_srcu); static const struct dlm_proto_ops *dlm_proto_ops; #define DLM_IO_SUCCESS 0 #define DLM_IO_END 1 #define DLM_IO_EOF 2 #define DLM_IO_RESCHED 3 #define DLM_IO_FLUSH 4 static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); static void process_dlm_messages(struct work_struct *work); static DECLARE_WORK(process_work, process_dlm_messages); static DEFINE_SPINLOCK(processqueue_lock); static bool process_dlm_messages_pending; static DECLARE_WAIT_QUEUE_HEAD(processqueue_wq); static atomic_t processqueue_count; static LIST_HEAD(processqueue); bool dlm_lowcomms_is_running(void) { return !!listen_con.sock; } static void lowcomms_queue_swork(struct connection *con) { assert_spin_locked(&con->writequeue_lock); if (!test_bit(CF_IO_STOP, &con->flags) && !test_bit(CF_APP_LIMITED, &con->flags) && !test_and_set_bit(CF_SEND_PENDING, &con->flags)) queue_work(io_workqueue, &con->swork); } static void lowcomms_queue_rwork(struct connection *con) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); #endif if (!test_bit(CF_IO_STOP, &con->flags) && !test_and_set_bit(CF_RECV_PENDING, &con->flags)) queue_work(io_workqueue, &con->rwork); } static void writequeue_entry_ctor(void *data) { struct writequeue_entry *entry = data; INIT_LIST_HEAD(&entry->msgs); } struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) { return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry), 0, 0, writequeue_entry_ctor); } struct kmem_cache 
*dlm_lowcomms_msg_cache_create(void) { return KMEM_CACHE(dlm_msg, 0); } /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { struct writequeue_entry *e; e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more. */ if (!e || e->users || e->len == 0) return NULL; return e; } static struct connection *__find_con(int nodeid, int r) { struct connection *con; hlist_for_each_entry_rcu(con, &connection_hash[r], list) { if (con->nodeid == nodeid) return con; } return NULL; } static void dlm_con_init(struct connection *con, int nodeid) { con->nodeid = nodeid; init_rwsem(&con->sock_lock); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); spin_lock_init(&con->addrs_lock); init_waitqueue_head(&con->shutdown_wait); } /* * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; int r; r = nodeid_hash(nodeid); con = __find_con(nodeid, r); if (con || !alloc) return con; con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; dlm_con_init(con, nodeid); spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() * we just check in rare cases of recently added nodes again * under protection of connections_lock. If this is the case we * abort our connection creation and return the existing connection. */ tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); kfree(con); return tmp; } hlist_add_head_rcu(&con->list, &connection_hash[r]); spin_unlock(&connections_lock); return con; } static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { switch (x->ss_family) { case AF_INET: { struct sockaddr_in *sinx = (struct sockaddr_in *)x; struct sockaddr_in *siny = (struct sockaddr_in *)y; if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) return 0; if (sinx->sin_port != siny->sin_port) return 0; break; } case AF_INET6: { struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) return 0; if (sinx->sin6_port != siny->sin6_port) return 0; break; } default: return 0; } return 1; } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, struct sockaddr *sa_out, bool try_new_addr, unsigned int *mark) { struct sockaddr_storage sas; struct connection *con; int idx; if (!dlm_local_count) return -1; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (!con) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } spin_lock(&con->addrs_lock); if (!con->addr_count) { spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } memcpy(&sas, &con->addr[con->curr_addr_index], sizeof(struct sockaddr_storage)); if (try_new_addr) { con->curr_addr_index++; if (con->curr_addr_index == con->addr_count) con->curr_addr_index = 0; } *mark = con->mark; spin_unlock(&con->addrs_lock); if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); if (!sa_out) { srcu_read_unlock(&connections_srcu, idx); return 0; } if (dlm_local_addr[0].ss_family == AF_INET) 
{ struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; } else { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas; struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out; ret6->sin6_addr = in6->sin6_addr; } srcu_read_unlock(&connections_srcu, idx); return 0; } static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, unsigned int *mark) { struct connection *con; int i, idx, addr_i; idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { WARN_ON_ONCE(!con->addr_count); spin_lock(&con->addrs_lock); for (addr_i = 0; addr_i < con->addr_count; addr_i++) { if (addr_compare(&con->addr[addr_i], addr)) { *nodeid = con->nodeid; *mark = con->mark; spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return 0; } } spin_unlock(&con->addrs_lock); } } srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } static bool dlm_lowcomms_con_has_addr(const struct connection *con, const struct sockaddr_storage *addr) { int i; for (i = 0; i < con->addr_count; i++) { if (addr_compare(&con->addr[i], addr)) return true; } return false; } int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr) { struct connection *con; bool ret; int idx; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, GFP_NOFS); if (!con) { srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; } spin_lock(&con->addrs_lock); if (!con->addr_count) { memcpy(&con->addr[0], addr, sizeof(*addr)); con->addr_count = 1; con->mark = dlm_config.ci_mark; spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return 0; } ret = dlm_lowcomms_con_has_addr(con, addr); if (ret) { spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return -EEXIST; } if (con->addr_count >= DLM_MAX_ADDR_COUNT) { spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return -ENOSPC; } memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr)); srcu_read_unlock(&connections_srcu, idx); spin_unlock(&con->addrs_lock); return 0; } /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); trace_sk_data_ready(sk); set_bit(CF_RECV_INTR, &con->flags); lowcomms_queue_rwork(con); } static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); clear_bit(SOCK_NOSPACE, &con->sock->flags); spin_lock_bh(&con->writequeue_lock); if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } lowcomms_queue_swork(con); spin_unlock_bh(&con->writequeue_lock); } static void lowcomms_state_change(struct sock *sk) { /* SCTP layer is not calling sk_data_ready when the connection * is done, so we catch the signal through here. 
*/ if (sk->sk_shutdown & RCV_SHUTDOWN) lowcomms_data_ready(sk); } static void lowcomms_listen_data_ready(struct sock *sk) { trace_sk_data_ready(sk); queue_work(io_workqueue, &listen_con.rwork); } int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; int idx; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } down_read(&con->sock_lock); if (!con->sock) { spin_lock_bh(&con->writequeue_lock); lowcomms_queue_swork(con); spin_unlock_bh(&con->writequeue_lock); } up_read(&con->sock_lock); srcu_read_unlock(&connections_srcu, idx); cond_resched(); return 0; } int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) { struct connection *con; int idx; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (!con) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } spin_lock(&con->addrs_lock); con->mark = mark; spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return 0; } static void lowcomms_error_report(struct sock *sk) { struct connection *con = sock2con(sk); struct inet_sock *inet; inet = inet_sk(sk); switch (sk->sk_family) { case AF_INET: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d at %pI4, dport %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), con->nodeid, &inet->inet_daddr, ntohs(inet->inet_dport), sk->sk_err, READ_ONCE(sk->sk_err_soft)); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d at %pI6c, " "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(), con->nodeid, &sk->sk_v6_daddr, ntohs(inet->inet_dport), sk->sk_err, READ_ONCE(sk->sk_err_soft)); break; #endif default: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "invalid socket family %d set, " "sk_err=%d/%d\n", dlm_our_nodeid(), sk->sk_family, sk->sk_err, READ_ONCE(sk->sk_err_soft)); break; } dlm_midcomms_unack_msg_resend(con->nodeid); listen_sock.sk_error_report(sk); } static void restore_callbacks(struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk)); #endif sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; } /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; lock_sock(sk); con->sock = sock; sk->sk_user_data = con; sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; if (dlm_config.ci_protocol == DLM_PROTO_SCTP) sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_use_task_frag = false; sk->sk_error_report = lowcomms_error_report; release_sock(sk); } /* Add the port number to an IPv6 or 4 sockaddr and return the address length */ static void make_sockaddr(struct sockaddr_storage *saddr, __be16 port, int *addr_len) { saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = port; *addr_len = sizeof(struct sockaddr_in); memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); } else { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; in6_addr->sin6_port = port; *addr_len = sizeof(struct sockaddr_in6); } memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); } static void 
dlm_page_release(struct kref *kref) { struct writequeue_entry *e = container_of(kref, struct writequeue_entry, ref); __free_page(e->page); dlm_free_writequeue(e); } static void dlm_msg_release(struct kref *kref) { struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); kref_put(&msg->entry->ref, dlm_page_release); dlm_free_msg(msg); } static void free_entry(struct writequeue_entry *e) { struct dlm_msg *msg, *tmp; list_for_each_entry_safe(msg, tmp, &e->msgs, list) { if (msg->orig_msg) { msg->orig_msg->retransmit = false; kref_put(&msg->orig_msg->ref, dlm_msg_release); } list_del(&msg->list); kref_put(&msg->ref, dlm_msg_release); } list_del(&e->list); kref_put(&e->ref, dlm_page_release); } static void dlm_close_sock(struct socket **sock) { lock_sock((*sock)->sk); restore_callbacks((*sock)->sk); release_sock((*sock)->sk); sock_release(*sock); *sock = NULL; } static void allow_connection_io(struct connection *con) { if (con->othercon) clear_bit(CF_IO_STOP, &con->othercon->flags); clear_bit(CF_IO_STOP, &con->flags); } static void stop_connection_io(struct connection *con) { if (con->othercon) stop_connection_io(con->othercon); spin_lock_bh(&con->writequeue_lock); set_bit(CF_IO_STOP, &con->flags); spin_unlock_bh(&con->writequeue_lock); down_write(&con->sock_lock); if (con->sock) { lock_sock(con->sock->sk); restore_callbacks(con->sock->sk); release_sock(con->sock->sk); } up_write(&con->sock_lock); cancel_work_sync(&con->swork); cancel_work_sync(&con->rwork); } /* Close a remote connection and tidy up */ static void close_connection(struct connection *con, bool and_other) { struct writequeue_entry *e; if (con->othercon && and_other) close_connection(con->othercon, false); down_write(&con->sock_lock); if (!con->sock) { up_write(&con->sock_lock); return; } dlm_close_sock(&con->sock); /* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. * * we can always drop messages because retransmits, but what we * cannot allow is to transmit half messages which may be processed * at the other side. * * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case. 
*/ spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_first_entry(&con->writequeue, struct writequeue_entry, list); if (e->dirty) free_entry(e); } spin_unlock_bh(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_APP_LIMITED, &con->flags); clear_bit(CF_RECV_PENDING, &con->flags); clear_bit(CF_SEND_PENDING, &con->flags); up_write(&con->sock_lock); } static void shutdown_connection(struct connection *con, bool and_other) { int ret; if (con->othercon && and_other) shutdown_connection(con->othercon, false); flush_workqueue(io_workqueue); down_read(&con->sock_lock); /* nothing to shutdown */ if (!con->sock) { up_read(&con->sock_lock); return; } ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how); up_read(&con->sock_lock); if (ret) { log_print("Connection %p failed to shutdown: %d will force close", con, ret); goto force_close; } else { ret = wait_event_timeout(con->shutdown_wait, !con->sock, DLM_SHUTDOWN_WAIT_TIMEOUT); if (ret == 0) { log_print("Connection %p shutdown timed out, will force close", con); goto force_close; } } return; force_close: close_connection(con, false); } static struct processqueue_entry *new_processqueue_entry(int nodeid, int buflen) { struct processqueue_entry *pentry; pentry = kmalloc(sizeof(*pentry), GFP_NOFS); if (!pentry) return NULL; pentry->buf = kmalloc(buflen, GFP_NOFS); if (!pentry->buf) { kfree(pentry); return NULL; } pentry->nodeid = nodeid; return pentry; } static void free_processqueue_entry(struct processqueue_entry *pentry) { kfree(pentry->buf); kfree(pentry); } static void process_dlm_messages(struct work_struct *work) { struct processqueue_entry *pentry; spin_lock_bh(&processqueue_lock); pentry = list_first_entry_or_null(&processqueue, struct processqueue_entry, list); if (WARN_ON_ONCE(!pentry)) { process_dlm_messages_pending = false; spin_unlock_bh(&processqueue_lock); return; } list_del(&pentry->list); if (atomic_dec_and_test(&processqueue_count)) wake_up(&processqueue_wq); spin_unlock_bh(&processqueue_lock); for (;;) { dlm_process_incoming_buffer(pentry->nodeid, pentry->buf, pentry->buflen); free_processqueue_entry(pentry); spin_lock_bh(&processqueue_lock); pentry = list_first_entry_or_null(&processqueue, struct processqueue_entry, list); if (!pentry) { process_dlm_messages_pending = false; spin_unlock_bh(&processqueue_lock); break; } list_del(&pentry->list); if (atomic_dec_and_test(&processqueue_count)) wake_up(&processqueue_wq); spin_unlock_bh(&processqueue_lock); } } /* Data received from remote end */ static int receive_from_sock(struct connection *con, int buflen) { struct processqueue_entry *pentry; int ret, buflen_real; struct msghdr msg; struct kvec iov; pentry = new_processqueue_entry(con->nodeid, buflen); if (!pentry) return DLM_IO_RESCHED; memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover); /* calculate new buffer parameter regarding last receive and * possible leftover bytes */ iov.iov_base = pentry->buf + con->rx_leftover; iov.iov_len = buflen - con->rx_leftover; memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; clear_bit(CF_RECV_INTR, &con->flags); again: ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); trace_dlm_recv(con->nodeid, ret); if (ret == -EAGAIN) { lock_sock(con->sock->sk); if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) { release_sock(con->sock->sk); goto again; } clear_bit(CF_RECV_PENDING, &con->flags); release_sock(con->sock->sk); free_processqueue_entry(pentry); return DLM_IO_END; } else if (ret == 
0) { /* close will clear CF_RECV_PENDING */ free_processqueue_entry(pentry); return DLM_IO_EOF; } else if (ret < 0) { free_processqueue_entry(pentry); return ret; } /* new buflen according to the received bytes and the leftover from the last receive */ buflen_real = ret + con->rx_leftover; ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf, buflen_real); if (ret < 0) { free_processqueue_entry(pentry); return ret; } pentry->buflen = ret; /* calculate the leftover bytes from processing and move them to the * beginning of the receive buffer, so that on the next receive the full * message is at the start address of the receive buffer. */ con->rx_leftover = buflen_real - ret; memmove(con->rx_leftover_buf, pentry->buf + ret, con->rx_leftover); spin_lock_bh(&processqueue_lock); ret = atomic_inc_return(&processqueue_count); list_add_tail(&pentry->list, &processqueue); if (!process_dlm_messages_pending) { process_dlm_messages_pending = true; queue_work(process_workqueue, &process_work); } spin_unlock_bh(&processqueue_lock); if (ret > DLM_MAX_PROCESS_BUFFERS) return DLM_IO_FLUSH; return DLM_IO_SUCCESS; } /* Listening socket is busy, accept a connection */ static int accept_from_sock(void) { struct sockaddr_storage peeraddr; int len, idx, result, nodeid; struct connection *newcon; struct socket *newsock; unsigned int mark; result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK); if (result == -EAGAIN) return DLM_IO_END; else if (result < 0) goto accept_err; /* Get the connected socket's peer */ memset(&peeraddr, 0, sizeof(peeraddr)); len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); if (len < 0) { result = -ECONNABORTED; goto accept_err; } /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { switch (peeraddr.ss_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr; log_print("connect from non cluster IPv4 node %pI4", &sin->sin_addr); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr; log_print("connect from non cluster IPv6 node %pI6c", &sin6->sin6_addr); break; } #endif default: log_print("invalid family from non cluster node"); break; } sock_release(newsock); return -1; } log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This * could happen if the two nodes initiate a connection at roughly * the same time and the connections cross on the wire.
* In this case we store the incoming one in "othercon" */ idx = srcu_read_lock(&connections_srcu); newcon = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!newcon)) { srcu_read_unlock(&connections_srcu, idx); result = -ENOENT; goto accept_err; } sock_set_mark(newsock->sk, mark); down_write(&newcon->sock_lock); if (newcon->sock) { struct connection *othercon = newcon->othercon; if (!othercon) { othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } dlm_con_init(othercon, nodeid); lockdep_set_subclass(&othercon->sock_lock, 1); newcon->othercon = othercon; set_bit(CF_IS_OTHERCON, &othercon->flags); } else { /* close other sock con if we have something new */ close_connection(othercon, false); } down_write(&othercon->sock_lock); add_sock(newsock, othercon); /* check if we received something while adding */ lock_sock(othercon->sock->sk); lowcomms_queue_rwork(othercon); release_sock(othercon->sock->sk); up_write(&othercon->sock_lock); } else { /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. */ add_sock(newsock, newcon); /* check if we received something while adding */ lock_sock(newcon->sock->sk); lowcomms_queue_rwork(newcon); release_sock(newcon->sock->sk); } up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); return DLM_IO_SUCCESS; accept_err: if (newsock) sock_release(newsock); return result; } /* * writequeue_entry_complete - try to delete and free write queue entry * @e: write queue entry to try to delete * @completed: bytes completed * * writequeue_lock must be held.
*/ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) { e->offset += completed; e->len -= completed; /* signal that page was half way transmitted */ e->dirty = true; if (e->len == 0 && e->users == 0) free_entry(e); } /* * sctp_bind_addrs - bind a SCTP socket to all our addresses */ static int sctp_bind_addrs(struct socket *sock, __be16 port) { struct sockaddr_storage localaddr; struct sockaddr *addr = (struct sockaddr *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr)); make_sockaddr(&localaddr, port, &addr_len); if (!i) result = kernel_bind(sock, addr, addr_len); else result = sock_bind_add(sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", port, i + 1, result); break; } } return result; } /* Get local addresses */ static void init_local(void) { struct sockaddr_storage sas; int i; dlm_local_count = 0; for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) { if (dlm_our_addr(&sas, i)) break; memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas)); } } static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; entry = dlm_allocate_writequeue(); if (!entry) return NULL; entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!entry->page) { dlm_free_writequeue(entry); return NULL; } entry->offset = 0; entry->len = 0; entry->end = 0; entry->dirty = false; entry->con = con; entry->users = 1; kref_init(&entry->ref); return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, char **ppc, void (*cb)(void *data), void *data) { struct writequeue_entry *e; spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { kref_get(&e->ref); *ppc = page_address(e->page) + e->end; if (cb) cb(data); e->end += len; e->users++; goto out; } } e = new_writequeue_entry(con); if (!e) goto out; kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; if (cb) cb(data); list_add_tail(&e->list, &con->writequeue); out: spin_unlock_bh(&con->writequeue_lock); return e; }; static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, char **ppc, void (*cb)(void *data), void *data) { struct writequeue_entry *e; struct dlm_msg *msg; msg = dlm_allocate_msg(); if (!msg) return NULL; kref_init(&msg->ref); e = new_wq_entry(con, len, ppc, cb, data); if (!e) { dlm_free_msg(msg); return NULL; } msg->retransmit = false; msg->orig_msg = NULL; msg->ppc = *ppc; msg->len = len; msg->entry = e; return msg; } /* avoid false positive for nodes_srcu, unlock happens in * dlm_lowcomms_commit_msg which is a must call if success */ #ifndef __CHECKER__ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, char **ppc, void (*cb)(void *data), void *data) { struct connection *con; struct dlm_msg *msg; int idx; if (len > DLM_MAX_SOCKET_BUFSIZE || len < sizeof(struct dlm_header)) { BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); WARN_ON_ONCE(1); return NULL; } idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return NULL; } msg = dlm_lowcomms_new_msg_con(con, len, ppc, cb, data); if (!msg) { srcu_read_unlock(&connections_srcu, idx); return NULL; } /* for dlm_lowcomms_commit_msg() */ kref_get(&msg->ref); /* we assume if 
successful commit must called */ msg->idx = idx; return msg; } #endif static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) { struct writequeue_entry *e = msg->entry; struct connection *con = e->con; int users; spin_lock_bh(&con->writequeue_lock); kref_get(&msg->ref); list_add(&msg->list, &e->msgs); users = --e->users; if (users) goto out; e->len = DLM_WQ_LENGTH_BYTES(e); lowcomms_queue_swork(con); out: spin_unlock_bh(&con->writequeue_lock); return; } /* avoid false positive for nodes_srcu, lock was happen in * dlm_lowcomms_new_msg */ #ifndef __CHECKER__ void dlm_lowcomms_commit_msg(struct dlm_msg *msg) { _dlm_lowcomms_commit_msg(msg); srcu_read_unlock(&connections_srcu, msg->idx); /* because dlm_lowcomms_new_msg() */ kref_put(&msg->ref, dlm_msg_release); } #endif void dlm_lowcomms_put_msg(struct dlm_msg *msg) { kref_put(&msg->ref, dlm_msg_release); } /* does not held connections_srcu, usage lowcomms_error_report only */ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) { struct dlm_msg *msg_resend; char *ppc; if (msg->retransmit) return 1; msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len, &ppc, NULL, NULL); if (!msg_resend) return -ENOMEM; msg->retransmit = true; kref_get(&msg->ref); msg_resend->orig_msg = msg; memcpy(ppc, msg->ppc, msg->len); _dlm_lowcomms_commit_msg(msg_resend); dlm_lowcomms_put_msg(msg_resend); return 0; } /* Send a message */ static int send_to_sock(struct connection *con) { struct writequeue_entry *e; struct bio_vec bvec; struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL, }; int len, offset, ret; spin_lock_bh(&con->writequeue_lock); e = con_next_wq(con); if (!e) { clear_bit(CF_SEND_PENDING, &con->flags); spin_unlock_bh(&con->writequeue_lock); return DLM_IO_END; } len = e->len; offset = e->offset; WARN_ON_ONCE(len == 0 && e->users == 0); spin_unlock_bh(&con->writequeue_lock); bvec_set_page(&bvec, e->page, len, offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); ret = sock_sendmsg(con->sock, &msg); trace_dlm_send(con->nodeid, ret); if (ret == -EAGAIN || ret == 0) { lock_sock(con->sock->sk); spin_lock_bh(&con->writequeue_lock); if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { /* Notify TCP that we're limited by the * application window size. 
*/ set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags); con->sock->sk->sk_write_pending++; clear_bit(CF_SEND_PENDING, &con->flags); spin_unlock_bh(&con->writequeue_lock); release_sock(con->sock->sk); /* wait for write_space() event */ return DLM_IO_END; } spin_unlock_bh(&con->writequeue_lock); release_sock(con->sock->sk); return DLM_IO_RESCHED; } else if (ret < 0) { return ret; } spin_lock_bh(&con->writequeue_lock); writequeue_entry_complete(e, ret); spin_unlock_bh(&con->writequeue_lock); return DLM_IO_SUCCESS; } static void clean_one_writequeue(struct connection *con) { struct writequeue_entry *e, *safe; spin_lock_bh(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { free_entry(e); } spin_unlock_bh(&con->writequeue_lock); } static void connection_release(struct rcu_head *rcu) { struct connection *con = container_of(rcu, struct connection, rcu); WARN_ON_ONCE(!list_empty(&con->writequeue)); WARN_ON_ONCE(con->sock); kfree(con); } /* Called from recovery when it knows that a node has left the cluster */ int dlm_lowcomms_close(int nodeid) { struct connection *con; int idx; log_print("closing connection to node %d", nodeid); idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } stop_connection_io(con); log_print("io handling for node: %d stopped", nodeid); close_connection(con, true); spin_lock(&connections_lock); hlist_del_rcu(&con->list); spin_unlock(&connections_lock); clean_one_writequeue(con); call_srcu(&connections_srcu, &con->rcu, connection_release); if (con->othercon) { clean_one_writequeue(con->othercon); call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); } srcu_read_unlock(&connections_srcu, idx); /* for debugging we print when we are done to compare with other * messages in between. This function needs to be correctly synchronized * with the io handling */ log_print("closing connection to node %d done", nodeid); return 0; } /* Receive worker function */ static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); int ret, buflen; down_read(&con->sock_lock); if (!con->sock) { up_read(&con->sock_lock); return; } buflen = READ_ONCE(dlm_config.ci_buffer_size); do { ret = receive_from_sock(con, buflen); } while (ret == DLM_IO_SUCCESS); up_read(&con->sock_lock); switch (ret) { case DLM_IO_END: /* CF_RECV_PENDING cleared */ break; case DLM_IO_EOF: close_connection(con, false); wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; case DLM_IO_FLUSH: /* we can't flush the process_workqueue here because flushing a * non WQ_MEM_RECLAIM workqueue such as process_workqueue from a * WQ_MEM_RECLAIM workqueue can deadlock. Instead * we have a waitqueue to wait until all messages are * processed. * * This handling is only necessary to back off the sender and * not queue all messages from the socket layer into the DLM * processqueue. When DLM is capable of parsing multiple messages * on an e.g. per-socket basis this handling might be * removed. Especially during a message burst we are too slow to * process messages and the queue would fill up memory.
*/ wait_event(processqueue_wq, !atomic_read(&processqueue_count)); fallthrough; case DLM_IO_RESCHED: cond_resched(); queue_work(io_workqueue, &con->rwork); /* CF_RECV_PENDING not cleared */ break; default: if (ret < 0) { if (test_bit(CF_IS_OTHERCON, &con->flags)) { close_connection(con, false); } else { spin_lock_bh(&con->writequeue_lock); lowcomms_queue_swork(con); spin_unlock_bh(&con->writequeue_lock); } /* CF_RECV_PENDING cleared for othercon * we trigger send queue if not already done * and process_send_sockets will handle it */ break; } WARN_ON_ONCE(1); break; } } static void process_listen_recv_socket(struct work_struct *work) { int ret; if (WARN_ON_ONCE(!listen_con.sock)) return; do { ret = accept_from_sock(); } while (ret == DLM_IO_SUCCESS); if (ret < 0) log_print("critical error accepting connection: %d", ret); } static int dlm_connect(struct connection *con) { struct sockaddr_storage addr; int result, addr_len; struct socket *sock; unsigned int mark; memset(&addr, 0, sizeof(addr)); result = nodeid_to_addr(con->nodeid, &addr, NULL, dlm_proto_ops->try_new_addr, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); return result; } /* Create a socket to communicate with */ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) return result; sock_set_mark(sock->sk, mark); dlm_proto_ops->sockopts(sock); result = dlm_proto_ops->bind(sock); if (result < 0) { sock_release(sock); return result; } add_sock(sock, con); log_print_ratelimited("connecting to %d", con->nodeid); make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); result = kernel_connect(sock, (struct sockaddr *)&addr, addr_len, 0); switch (result) { case -EINPROGRESS: /* not an error */ fallthrough; case 0: break; default: if (result < 0) dlm_close_sock(&con->sock); break; } return result; } /* Send worker function */ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); int ret; WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags)); down_read(&con->sock_lock); if (!con->sock) { up_read(&con->sock_lock); down_write(&con->sock_lock); if (!con->sock) { ret = dlm_connect(con); switch (ret) { case 0: break; default: /* CF_SEND_PENDING not cleared */ up_write(&con->sock_lock); log_print("connect to node %d try %d error %d", con->nodeid, con->retries++, ret); msleep(1000); /* For now we try forever to reconnect. In * future we should send a event to cluster * manager to fence itself after certain amount * of retries. 
*/ queue_work(io_workqueue, &con->swork); return; } } downgrade_write(&con->sock_lock); } do { ret = send_to_sock(con); } while (ret == DLM_IO_SUCCESS); up_read(&con->sock_lock); switch (ret) { case DLM_IO_END: /* CF_SEND_PENDING cleared */ break; case DLM_IO_RESCHED: /* CF_SEND_PENDING not cleared */ cond_resched(); queue_work(io_workqueue, &con->swork); break; default: if (ret < 0) { close_connection(con, false); /* CF_SEND_PENDING cleared */ spin_lock_bh(&con->writequeue_lock); lowcomms_queue_swork(con); spin_unlock_bh(&con->writequeue_lock); break; } WARN_ON_ONCE(1); break; } } static void work_stop(void) { if (io_workqueue) { destroy_workqueue(io_workqueue); io_workqueue = NULL; } if (process_workqueue) { destroy_workqueue(process_workqueue); process_workqueue = NULL; } } static int work_start(void) { io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!io_workqueue) { log_print("can't start dlm_io"); return -ENOMEM; } process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0); if (!process_workqueue) { log_print("can't start dlm_process"); destroy_workqueue(io_workqueue); io_workqueue = NULL; return -ENOMEM; } return 0; } void dlm_lowcomms_shutdown(void) { struct connection *con; int i, idx; /* stop lowcomms_listen_data_ready calls */ lock_sock(listen_con.sock->sk); listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; release_sock(listen_con.sock->sk); cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { shutdown_connection(con, true); stop_connection_io(con); flush_workqueue(process_workqueue); close_connection(con, true); clean_one_writequeue(con); if (con->othercon) clean_one_writequeue(con->othercon); allow_connection_io(con); } } srcu_read_unlock(&connections_srcu, idx); } void dlm_lowcomms_stop(void) { work_stop(); dlm_proto_ops = NULL; } static int dlm_listen_for_all(void) { struct socket *sock; int result; log_print("Using %s for communications", dlm_proto_ops->name); result = dlm_proto_ops->listen_validate(); if (result < 0) return result; result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { log_print("Can't create comms socket: %d", result); return result; } sock_set_mark(sock->sk, dlm_config.ci_mark); dlm_proto_ops->listen_sockopts(sock); result = dlm_proto_ops->listen_bind(sock); if (result < 0) goto out; lock_sock(sock->sk); listen_sock.sk_data_ready = sock->sk->sk_data_ready; listen_sock.sk_write_space = sock->sk->sk_write_space; listen_sock.sk_error_report = sock->sk->sk_error_report; listen_sock.sk_state_change = sock->sk->sk_state_change; listen_con.sock = sock; sock->sk->sk_allocation = GFP_NOFS; sock->sk->sk_use_task_frag = false; sock->sk->sk_data_ready = lowcomms_listen_data_ready; release_sock(sock->sk); result = sock->ops->listen(sock, 128); if (result < 0) { dlm_close_sock(&listen_con.sock); return result; } return 0; out: sock_release(sock); return result; } static int dlm_tcp_bind(struct socket *sock) { struct sockaddr_storage src_addr; int result, addr_len; /* Bind to our cluster-known address connecting to avoid * routing problems. 
*/ memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); result = kernel_bind(sock, (struct sockaddr *)&src_addr, addr_len); if (result < 0) { /* This *may* not indicate a critical error */ log_print("could not bind for connect: %d", result); } return 0; } static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { log_print("Detect multi-homed hosts but use only the first IP address."); log_print("Try SCTP, if you want to enable multi-link."); } return 0; } static void dlm_tcp_sockopts(struct socket *sock) { /* Turn off Nagle's algorithm */ tcp_sock_set_nodelay(sock->sk); } static void dlm_tcp_listen_sockopts(struct socket *sock) { dlm_tcp_sockopts(sock); sock_set_reuseaddr(sock->sk); } static int dlm_tcp_listen_bind(struct socket *sock) { int addr_len; /* Bind to our port */ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0], addr_len); } static const struct dlm_proto_ops dlm_tcp_ops = { .name = "TCP", .proto = IPPROTO_TCP, .how = SHUT_WR, .sockopts = dlm_tcp_sockopts, .bind = dlm_tcp_bind, .listen_validate = dlm_tcp_listen_validate, .listen_sockopts = dlm_tcp_listen_sockopts, .listen_bind = dlm_tcp_listen_bind, }; static int dlm_sctp_bind(struct socket *sock) { return sctp_bind_addrs(sock, 0); } static int dlm_sctp_listen_validate(void) { if (!IS_ENABLED(CONFIG_IP_SCTP)) { log_print("SCTP is not enabled by this kernel"); return -EOPNOTSUPP; } request_module("sctp"); return 0; } static int dlm_sctp_bind_listen(struct socket *sock) { return sctp_bind_addrs(sock, dlm_config.ci_tcp_port); } static void dlm_sctp_sockopts(struct socket *sock) { /* Turn off Nagle's algorithm */ sctp_sock_set_nodelay(sock->sk); sock_set_rcvbuf(sock->sk, NEEDED_RMEM); } static const struct dlm_proto_ops dlm_sctp_ops = { .name = "SCTP", .proto = IPPROTO_SCTP, .how = SHUT_RDWR, .try_new_addr = true, .sockopts = dlm_sctp_sockopts, .bind = dlm_sctp_bind, .listen_validate = dlm_sctp_listen_validate, .listen_sockopts = dlm_sctp_sockopts, .listen_bind = dlm_sctp_bind_listen, }; int dlm_lowcomms_start(void) { int error; init_local(); if (!dlm_local_count) { error = -ENOTCONN; log_print("no local IP address has been set"); goto fail; } error = work_start(); if (error) goto fail; /* Start listening */ switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: dlm_proto_ops = &dlm_tcp_ops; break; case DLM_PROTO_SCTP: dlm_proto_ops = &dlm_sctp_ops; break; default: log_print("Invalid protocol identifier %d set", dlm_config.ci_protocol); error = -EINVAL; goto fail_proto_ops; } error = dlm_listen_for_all(); if (error) goto fail_listen; return 0; fail_listen: dlm_proto_ops = NULL; fail_proto_ops: work_stop(); fail: return error; } void dlm_lowcomms_init(void) { int i; for (i = 0; i < CONN_HASH_SIZE; i++) INIT_HLIST_HEAD(&connection_hash[i]); INIT_WORK(&listen_con.rwork, process_listen_recv_socket); } void dlm_lowcomms_exit(void) { struct connection *con; int i, idx; idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { spin_lock(&connections_lock); hlist_del_rcu(&con->list); spin_unlock(&connections_lock); if (con->othercon) call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); call_srcu(&connections_srcu, &con->rcu, connection_release); } } srcu_read_unlock(&connections_srcu, idx); } |
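/*
 * Illustrative sketch (not part of the file above): the message API that
 * lowcomms exposes follows an allocate/fill/commit/put pattern.  The caller
 * and payload names below are made up for illustration; only
 * dlm_lowcomms_new_msg(), dlm_lowcomms_commit_msg() and dlm_lowcomms_put_msg()
 * come from the code above.
 */
#if 0
static int example_send_to_node(int nodeid, const void *payload, int len)
{
	struct dlm_msg *msg;
	char *ppc;

	/* len must be >= sizeof(struct dlm_header) and <= DLM_MAX_SOCKET_BUFSIZE */
	msg = dlm_lowcomms_new_msg(nodeid, len, &ppc, NULL, NULL);
	if (!msg)
		return -ENOMEM;

	memcpy(ppc, payload, len);	/* fill the slot reserved in the writequeue page */
	dlm_lowcomms_commit_msg(msg);	/* queue it; send_to_sock() picks it up */
	dlm_lowcomms_put_msg(msg);	/* drop the caller's reference */
	return 0;
}
#endif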
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/mm.h> #include <asm/io.h> struct scatterlist { unsigned long page_link; unsigned int offset; unsigned int length; dma_addr_t dma_address; #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif #ifdef CONFIG_NEED_SG_DMA_FLAGS unsigned int dma_flags; #endif }; /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths.
* You should only work with the number of sg entries dma_map_sg * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. */ #define sg_dma_address(sg) ((sg)->dma_address) #ifdef CONFIG_NEED_SG_DMA_LENGTH #define sg_dma_len(sg) ((sg)->dma_length) #else #define sg_dma_len(sg) ((sg)->length) #endif struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ unsigned int orig_nents; /* original size of list */ }; struct sg_append_table { struct sg_table sgt; /* The scatter list table */ struct scatterlist *prv; /* last populated sge in the table */ unsigned int total_nents; /* Total entries in the table */ }; /* * Notes on SG table design. * * We use the unsigned long page_link field in the scatterlist struct to place * the page pointer AND encode information about the sg table as well. The two * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. * * If bit 1 is set, then this sg entry is the last element in a list. * * See sg_next(). * */ #define SG_CHAIN 0x01UL #define SG_END 0x02UL /* * We overload the LSB of the page pointer to indicate whether it's * a valid sg entry, or whether it points to the start of a new scatterlist. * Those low bits are there for everyone! (thanks mason :-) */ #define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END) static inline unsigned int __sg_flags(struct scatterlist *sg) { return sg->page_link & SG_PAGE_LINK_MASK; } static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg) { return (struct scatterlist *)(sg->page_link & ~SG_PAGE_LINK_MASK); } static inline bool sg_is_chain(struct scatterlist *sg) { return __sg_flags(sg) & SG_CHAIN; } static inline bool sg_is_last(struct scatterlist *sg) { return __sg_flags(sg) & SG_END; } /** * sg_next - return the next scatterlist entry in a list * @sg: The current sg entry * * Description: * Usually the next entry will be @sg + 1, but if this sg element is part * of a chained scatterlist, it could jump to the start of a new * scatterlist array. * **/ static inline struct scatterlist *sg_next(struct scatterlist *sg) { if (sg_is_last(sg)) return NULL; sg++; if (unlikely(sg_is_chain(sg))) sg = sg_chain_ptr(sg); return sg; } /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry * @page: The page * * Description: * Assign page to sg entry. Also see sg_set_page(), the most commonly used * variant. * **/ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) { unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END); /* * In order for the low bit stealing approach to work, pages * must be aligned at a 32-bit boundary as a minimum. */ BUG_ON((unsigned long)page & SG_PAGE_LINK_MASK); #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; } /** * sg_set_page - Set sg entry to point at given page * @sg: SG entry * @page: The page * @len: Length of data * @offset: Offset into page * * Description: * Use this function to set an sg entry pointing at a page, never assign * the page directly. We encode sg table information in the lower bits * of the page pointer. See sg_page() for looking up the page belonging * to an sg entry. 
* **/ static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { sg_assign_page(sg, page); sg->offset = offset; sg->length = len; } /** * sg_set_folio - Set sg entry to point at given folio * @sg: SG entry * @folio: The folio * @len: Length of data * @offset: Offset into folio * * Description: * Use this function to set an sg entry pointing at a folio, never assign * the folio directly. We encode sg table information in the lower bits * of the folio pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_folio(struct scatterlist *sg, struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(len > UINT_MAX); WARN_ON_ONCE(offset > UINT_MAX); sg_assign_page(sg, &folio->page); sg->offset = offset; sg->length = len; } static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK); } /** * sg_set_buf - Set sg entry to point at given data * @sg: SG entry * @buf: Data * @buflen: Data length * **/ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { #ifdef CONFIG_DEBUG_SG BUG_ON(!virt_addr_valid(buf)); #endif sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } /* * Loop over each sg element, following the pointer to a new list if necessary */ #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) /* * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) static inline void __sg_chain(struct scatterlist *chain_sg, struct scatterlist *sgl) { /* * offset and length are unused for chain entry. Clear them. */ chain_sg->offset = 0; chain_sg->length = 0; /* * Set lowest bit to indicate a link pointer, and make sure to clear * the termination bit if it happens to be set. */ chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END; } /** * sg_chain - Chain two sglists together * @prv: First scatterlist * @prv_nents: Number of entries in prv * @sgl: Second scatterlist * * Description: * Links @prv and @sgl together, to form a longer scatterlist. * **/ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { __sg_chain(&prv[prv_nents - 1], sgl); } /** * sg_mark_end - Mark the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Marks the passed in sg entry as the termination point for the sg * table. A call to sg_next() on this entry will return NULL. * **/ static inline void sg_mark_end(struct scatterlist *sg) { /* * Set termination bit, clear potential chain bit */ sg->page_link |= SG_END; sg->page_link &= ~SG_CHAIN; } /** * sg_unmark_end - Undo setting the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Removes the termination marker from the given entry of the scatterlist. * **/ static inline void sg_unmark_end(struct scatterlist *sg) { sg->page_link &= ~SG_END; } /* * On 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). 
Use this padding for DMA * flags bits to indicate when a specific dma address is a bus address or the * buffer may have been bounced via SWIOTLB. */ #ifdef CONFIG_NEED_SG_DMA_FLAGS #define SG_DMA_BUS_ADDRESS (1 << 0) #define SG_DMA_SWIOTLB (1 << 1) /** * sg_dma_is_bus_address - Return whether a given segment was marked * as a bus address * @sg: SG entry * * Description: * Returns true if sg_dma_mark_bus_address() has been called on * this segment. **/ static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_BUS_ADDRESS; } /** * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Marks the passed in sg entry to indicate that the dma_address is * a bus address and doesn't need to be unmapped. This should only be * used by dma_map_sg() implementations to mark bus addresses * so they can be properly cleaned up in dma_unmap_sg(). **/ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_BUS_ADDRESS; } /** * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Clears the bus address mark. **/ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { sg->dma_flags &= ~SG_DMA_BUS_ADDRESS; } /** * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB * bouncing * @sg: SG entry * * Description: * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all * elements may have been bounced, so the caller would have to check * individual SG entries with swiotlb_find_pool(). */ static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_SWIOTLB; } /** * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing * @sg: SG entry * * Description: * Marks a a scatterlist for SWIOTLB bounce. Not all SG entries may be * bounced. */ static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_SWIOTLB; } #else static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { } static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { } #endif /* CONFIG_NEED_SG_DMA_FLAGS */ /** * sg_phys - Return physical address of an sg entry * @sg: SG entry * * Description: * This calls page_to_phys() on the page in this sg entry, and adds the * sg offset. The caller must know that it is legal to call page_to_phys() * on the sg page. * **/ static inline dma_addr_t sg_phys(struct scatterlist *sg) { return page_to_phys(sg_page(sg)) + sg->offset; } /** * sg_virt - Return virtual address of an sg entry * @sg: SG entry * * Description: * This calls page_address() on the page in this sg entry, and adds the * sg offset. The caller must know that the sg page has a valid virtual * mapping. 
* **/ static inline void *sg_virt(struct scatterlist *sg) { return page_address(sg_page(sg)) + sg->offset; } /** * sg_init_marker - Initialize markers in sg table * @sgl: The SG table * @nents: Number of entries in table * **/ static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { sg_mark_end(&sgl[nents - 1]); } int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); int sg_split(struct scatterlist *in, const int in_mapped_nents, const off_t skip, const int nb_splits, const size_t *split_sizes, struct scatterlist **out, int *out_mapped_nents, gfp_t gfp_mask); typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask); /** * sg_alloc_table_from_pages - Allocate and initialize an sg table from * an array of pages * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node. A user * may provide an offset at a start and a size of valid data in a buffer * specified by the page array. The returned sg table is released by * sg_free_table. 
* * Returns: * 0 on success, negative error on failure */ static inline int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask) { return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, size, UINT_MAX, gfp_mask); } #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if * a list larger than this is required then chaining will be utilized. */ #define SG_MAX_SINGLE_ALLOC (PAGE_SIZE / sizeof(struct scatterlist)) /* * The maximum number of SG segments that we will put inside a * scatterlist (unless chaining is used). Should ideally fit inside a * single page, to avoid a higher order allocation. We could define this * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The * minimum value is 32 */ #define SG_CHUNK_SIZE 128 /* * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SG_MAX_SEGMENTS SG_CHUNK_SIZE #else #define SG_MAX_SEGMENTS 2048 #endif #ifdef CONFIG_SG_POOL void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk); int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk); #endif /* * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, you * can call sg_page_iter_page(@piter) to get the current page. * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to * the page's page offset within the sg. The iteration will stop either when a * maximum number of sg entries was reached or a terminating sg * (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ /* these are internal states, keep away */ unsigned int __nents; /* remaining sg entries */ int __pg_advance; /* nr pages to advance at the * next step */ }; /* * sg page iterator for DMA addresses * * This is the same as sg_page_iter however you can call * sg_page_iter_dma_address(@dma_iter) to get the page's DMA * address. sg_page_iter_page() cannot be called on this iterator. 
*/ struct sg_dma_page_iter { struct sg_page_iter base; }; bool __sg_page_iter_next(struct sg_page_iter *piter); bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); /** * sg_page_iter_page - get the current page held by the page iterator * @piter: page iterator holding the page */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { return nth_page(sg_page(piter->sg), piter->sg_pgoffset); } /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. * @dma_iter: page iterator holding the page */ static inline dma_addr_t sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { return sg_dma_address(dma_iter->base.sg) + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** * for_each_sg_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) /** * for_each_sgtable_page - iterate over all pages in the sg_table object * @sgt: sg_table object to iterate over * @piter: page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all memory pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object * @sgt: sg_table object to iterate over * @dma_iter: DMA page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all DMA mapped pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* * Mapping sg iterator * * Iterates over sg entries mapping page-by-page. On each successful * iteration, @miter->page points to the mapped page and * @miter->length bytes of data can be accessed at @miter->addr. As * long as an iteration is enclosed between start and stop, the user * is free to choose control structure and when to stop. * * @miter->consumed is set to @miter->length on each iteration. 
It * can be adjusted if the user can't consume all the bytes in one go. * Also, a stopped iteration can be resumed by calling next on it. * This is useful when iteration needs to release all resources and * continue later (e.g. at the next interrupt). */ #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ #define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ #define SG_MITER_FROM_SG (1 << 2) /* nop */ #define SG_MITER_LOCAL (1 << 3) /* use kmap_local */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ struct page *page; /* currently mapped page */ void *addr; /* pointer to the mapped area */ size_t length; /* length of the mapped area */ size_t consumed; /* number of consumed bytes */ struct sg_page_iter piter; /* page iterator */ /* these are internal states, keep away */ unsigned int __offset; /* offset within page */ unsigned int __remaining; /* remaining bytes on page */ unsigned int __flags; }; void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags); bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset); bool sg_miter_next(struct sg_mapping_iter *miter); void sg_miter_stop(struct sg_mapping_iter *miter); #endif /* _LINUX_SCATTERLIST_H */
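/*
 * Illustrative sketch (not part of the header above): a minimal user of the
 * table helpers and the mapping iterator declared here.  The buffer names and
 * the caller are made up for illustration; the buffers are assumed to be
 * linearly mapped (e.g. kmalloc'd) so that sg_set_buf() is legal.
 */
#if 0
static void example_zero_two_buffers(void *buf_a, unsigned int len_a,
				     void *buf_b, unsigned int len_b)
{
	struct scatterlist sgl[2];
	struct sg_mapping_iter miter;

	sg_init_table(sgl, 2);			/* zero the entries, mark sgl[1] as the end */
	sg_set_buf(&sgl[0], buf_a, len_a);
	sg_set_buf(&sgl[1], buf_b, len_b);

	/* SG_MITER_TO_SG: we write into the pages behind the sg entries */
	sg_miter_start(&miter, sgl, 2, SG_MITER_TO_SG | SG_MITER_LOCAL);
	while (sg_miter_next(&miter))
		memset(miter.addr, 0, miter.length);	/* miter.length bytes mapped at miter.addr */
	sg_miter_stop(&miter);
}
#endif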
// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/group.c - Operations for adding/removing multiple files at once.
* * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation */ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> #include <linux/fs.h> #include "sysfs.h" static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) { if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); return 0; } static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or * visibility. Do this by first removing then * re-adding (if required) the file. */ if (update) kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, gid, NULL); if (unlikely(error)) break; } if (error) { remove_files(parent, grp); goto exit; } } if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; size_t size = (*bin_attr)->size; if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } if (grp->bin_size) size = grp->bin_size(kobj, *bin_attr, i); WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, mode, size, uid, gid, NULL); if (error) break; } if (error) remove_files(parent, grp); } exit: return error; } static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; kuid_t uid; kgid_t gid; int error; if (WARN_ON(!kobj || (!update && !kobj->sd))) return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", kobj->name, grp->name ?: ""); return 0; } kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { umode_t mode = __first_visible(grp, kobj); if (mode & SYSFS_GROUP_INVISIBLE) mode = 0; else mode = S_IRWXU | S_IRUGO | S_IXUGO; if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { pr_debug("attr grp %s/%s not created yet\n", 
kobj->name, grp->name); /* may have been invisible prior to this update */ update = 0; } else if (!mode) { sysfs_remove_group(kobj, grp); kernfs_put(kn); return 0; } } if (!update) { if (!mode) return 0; kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn); } } } else { kn = kobj->sd; } kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); } kernfs_put(kn); if (grp->name && update) kernfs_put(kn); return error; } /** * sysfs_create_group - given a directory kobject, create an attribute group * @kobj: The kobject to create the group on * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 0, grp); } EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, const struct attribute_group **groups) { int error = 0; int i; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]); if (error) { while (--i >= 0) sysfs_remove_group(kobj, groups[i]); break; } } return error; } /** * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to create the group on * @groups: The attribute groups to create, NULL terminated * * This function creates a bunch of attribute groups. If an error occurs when * creating a group, all previously created groups will be removed, unwinding * everything back to the original state when this function was called. * It will explicitly warn and error if any of the attribute files being * created already exist. * * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 0, groups); } EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to update the group on * @groups: The attribute groups to update, NULL terminated * * This function update a bunch of attribute groups. If an error occurs when * updating a group, all previously updated groups will be removed together * with already existing (not updated) attributes. * * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 1, groups); } EXPORT_SYMBOL_GPL(sysfs_update_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group * @kobj: The kobject to update the group on * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any * of the attribute files being created already exist. Furthermore, * if the visibility of the files has changed through the is_visible() * callback, it will update the permissions and add or remove the * relevant files. Changing a group's name (subdirectory name under * kobj's directory in sysfs) is not allowed. 
* * The primary use for this function is to call it after making a change * that affects group visibility. * * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 1, grp); } EXPORT_SYMBOL_GPL(sysfs_update_group); /** * sysfs_remove_group: remove a group from a kobject * @kobj: kobject to remove the group from * @grp: group to remove * * This function removes a group of attributes from a kobject. The attributes * previously have to have been created for this group, otherwise it will fail. */ void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { pr_debug("sysfs group '%s' not found for kobject '%s'\n", grp->name, kobject_name(kobj)); return; } } else { kn = parent; kernfs_get(kn); } remove_files(kn, grp); if (grp->name) kernfs_remove(kn); kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); /** * sysfs_remove_groups - remove a list of groups * * @kobj: The kobject for the groups to be removed from * @groups: NULL terminated list of groups to be removed * * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { int i; if (!groups) return; for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * * This function returns an error if the group doesn't exist, the .name field is * NULL or any of the files already exist in that group, in which case none of * the new files are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error = 0; struct attribute *const *attr; int i; parent = kernfs_find_and_get(kobj->sd, grp->name); if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); } kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_merge_group); /** * sysfs_unmerge_group - remove files from a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; struct attribute *const *attr; parent = kernfs_find_and_get(kobj->sd, grp->name); if (parent) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); /** * sysfs_add_link_to_group - add a symlink to an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @target: The target kobject of the symlink to create. * @link_name: The name of the symlink to create. 
*/ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { struct kernfs_node *parent; int error = 0; parent = kernfs_find_and_get(kobj->sd, group_name); if (!parent) return -ENOENT; error = sysfs_create_link_sd(parent, target, link_name); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); /** * sysfs_remove_link_from_group - remove a symlink from an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @link_name: The name of the symlink to remove. */ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { struct kernfs_node *parent; parent = kernfs_find_and_get(kobj->sd, group_name); if (parent) { kernfs_remove_by_name(parent, link_name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. * @symlink_name: The name of the symlink file (target_name will be * considered if symlink_name is NULL). */ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; struct kernfs_node *link; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() * for details. */ spin_lock(&sysfs_symlink_target_lock); target = target_kobj->sd; if (target) kernfs_get(target); spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; } if (!symlink_name) symlink_name = target_name; link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; int error; if (grp->attrs) { struct attribute *const *attr; for (attr = grp->attrs; *attr; attr++) { kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } if (grp->bin_attrs) { const struct bin_attribute *const *bin_attr; for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } return 0; } /** * sysfs_group_change_owner - change owner of an attribute group. * @kobj: The kobject containing the group. * @grp: The attribute group. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. 
*/ int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *grp, kuid_t kuid, kgid_t kgid) { struct kernfs_node *grp_kn; int error; struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; if (!kobj->state_in_sysfs) return -EINVAL; if (grp->name) { grp_kn = kernfs_find_and_get(kobj->sd, grp->name); } else { kernfs_get(kobj->sd); grp_kn = kobj->sd; } if (!grp_kn) return -ENOENT; error = kernfs_setattr(grp_kn, &newattrs); if (!error) error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); kernfs_put(grp_kn); return error; } EXPORT_SYMBOL_GPL(sysfs_group_change_owner); /** * sysfs_groups_change_owner - change owner of a set of attribute groups. * @kobj: The kobject containing the groups. * @groups: The attribute groups. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; if (!kobj->state_in_sysfs) return -EINVAL; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); if (error) break; } return error; } EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
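The create/update distinction above is easiest to see in a short usage sketch. The following is illustrative only and is not part of fs/sysfs/group.c: the device, the "example" group name, the foo/bar attributes and the bar_is_supported() helper are all invented for this sketch. A driver would typically create the group once at probe time and call sysfs_update_group() again after a state change that affects is_visible():

#include <linux/device.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Hypothetical runtime condition controlling visibility of "bar". */
static bool bar_is_supported(void)
{
	return true;
}

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	return sysfs_emit(buf, "%d\n", 42);
}
static DEVICE_ATTR_RO(foo);

static ssize_t bar_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(bar);

static struct attribute *example_attrs[] = {
	&dev_attr_foo.attr,
	&dev_attr_bar.attr,
	NULL,
};

static umode_t example_attr_visible(struct kobject *kobj,
				    struct attribute *attr, int n)
{
	/* Hide "bar" unless the hypothetical condition holds. */
	if (attr == &dev_attr_bar.attr && !bar_is_supported())
		return 0;
	return attr->mode;
}

static const struct attribute_group example_group = {
	.name		= "example",	/* subdirectory under the kobject */
	.is_visible	= example_attr_visible,
	.attrs		= example_attrs,
};

static int example_probe(struct device *dev)
{
	/* First-time creation: warns if any of the files already exist. */
	return sysfs_create_group(&dev->kobj, &example_group);
}

static void example_condition_changed(struct device *dev)
{
	/* Re-runs is_visible(), updating permissions and adding or removing files. */
	if (sysfs_update_group(&dev->kobj, &example_group))
		dev_warn(dev, "failed to update example attribute group\n");
}

On teardown the group would be dropped with sysfs_remove_group(&dev->kobj, &example_group); the NULL terminator in example_attrs[] and the named, const group definition follow the conventions the code above relies on.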
// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) "irq: " fmt #include <linux/acpi.h> #include <linux/debugfs.h> #include <linux/hardirq.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/irqdomain.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> #include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/fs.h> static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); static struct irq_domain *irq_default_domain; static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity); static void irq_domain_check_hierarchy(struct irq_domain *domain); static void irq_domain_free_one_irq(struct
irq_domain *domain, unsigned int virq); struct irqchip_fwid { struct fwnode_handle fwnode; unsigned int type; char *name; phys_addr_t *pa; }; #ifdef CONFIG_GENERIC_IRQ_DEBUGFS static void debugfs_add_domain_dir(struct irq_domain *d); static void debugfs_remove_domain_dir(struct irq_domain *d); #else static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); return fwid->name; } const struct fwnode_operations irqchip_fwnode_ops = { .get_name = irqchip_fwnode_get_name, }; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain * @type: Type of irqchip_fwnode. See linux/irqdomain.h * @id: Optional user provided id if name != NULL * @name: Optional user provided domain name * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a pointer to the embedded * fwnode_handle (or NULL on failure). * * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are * solely to transport name information to irqdomain creation code. The * node is not stored. For other types the pointer is kept in the irq * domain struct. */ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, const char *name, phys_addr_t *pa) { struct irqchip_fwid *fwid; char *n; fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); switch (type) { case IRQCHIP_FWNODE_NAMED: n = kasprintf(GFP_KERNEL, "%s", name); break; case IRQCHIP_FWNODE_NAMED_ID: n = kasprintf(GFP_KERNEL, "%s-%d", name, id); break; default: n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa); break; } if (!fwid || !n) { kfree(fwid); kfree(n); return NULL; } fwid->type = type; fwid->name = n; fwid->pa = pa; fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle * @fwnode: fwnode_handle to free * * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. */ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token) { if (bus_token == DOMAIN_BUS_ANY) domain->name = kasprintf(GFP_KERNEL, "%s", base); else domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token); if (!domain->name) return -ENOMEM; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token, const char *suffix) { const char *sep = suffix ? "-" : ""; const char *suf = suffix ? : ""; char *name; if (bus_token == DOMAIN_BUS_ANY) name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf); else name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token); if (!name) return -ENOMEM; /* * fwnode paths contain '/', which debugfs is legitimately unhappy * about. Replace them with ':', which does the trick and is not as * offensive as '\'... 
*/ domain->name = strreplace(name, '/', ':'); domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { static atomic_t unknown_domains; int id = atomic_inc_return(&unknown_domains); if (bus_token == DOMAIN_BUS_ANY) domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id); else domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token); if (!domain->name) return -ENOMEM; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info) { enum irq_domain_bus_token bus_token = info->bus_token; const struct fwnode_handle *fwnode = info->fwnode; if (is_fwnode_irqchip(fwnode)) { struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); /* * The name_suffix is only intended to be used to avoid a name * collision when multiple domains are created for a single * device and the name is picked using a real device node. * (Typical use-case is regmap-IRQ controllers for devices * providing more than one physical IRQ.) There should be no * need to use name_suffix with irqchip-fwnode. */ if (info->name_suffix) return -EINVAL; switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: return alloc_name(domain, fwid->name, bus_token); default: domain->name = fwid->name; if (bus_token != DOMAIN_BUS_ANY) return alloc_name(domain, fwid->name, bus_token); } } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) { return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix); } if (domain->name) return 0; if (fwnode) pr_err("Invalid fwnode type for irqdomain\n"); return alloc_unknown_name(domain, bus_token); } static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info) { struct irq_domain *domain; int err; if (WARN_ON((info->size && info->direct_max) || (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) || (info->direct_max && info->direct_max != info->hwirq_max))) return ERR_PTR(-EINVAL); domain = kzalloc_node(struct_size(domain, revmap, info->size), GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode))); if (!domain) return ERR_PTR(-ENOMEM); err = irq_domain_set_name(domain, info); if (err) { kfree(domain); return ERR_PTR(err); } domain->fwnode = fwnode_handle_get(info->fwnode); fwnode_dev_initialized(domain->fwnode, true); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = info->ops; domain->host_data = info->host_data; domain->bus_token = info->bus_token; domain->hwirq_max = info->hwirq_max; if (info->direct_max) domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; domain->revmap_size = info->size; /* * Hierarchical domains use the domain lock of the root domain * (innermost domain). * * For non-hierarchical domains (as for root domains), the root * pointer is set to the domain itself so that &domain->root->mutex * always points to the right lock. 
*/ mutex_init(&domain->mutex); domain->root = domain; irq_domain_check_hierarchy(domain); return domain; } static void __irq_domain_publish(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); pr_debug("Added domain %s\n", domain->name); } static void irq_domain_free(struct irq_domain *domain) { fwnode_dev_initialized(domain->fwnode, false); fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); } static void irq_domain_instantiate_descs(const struct irq_domain_info *info) { if (!IS_ENABLED(CONFIG_SPARSE_IRQ)) return; if (irq_alloc_descs(info->virq_base, info->virq_base, info->size, of_node_to_nid(to_of_node(info->fwnode))) < 0) { pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", info->virq_base); } } static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info, bool cond_alloc_descs, bool force_associate) { struct irq_domain *domain; int err; domain = __irq_domain_create(info); if (IS_ERR(domain)) return domain; domain->flags |= info->domain_flags; domain->exit = info->exit; domain->dev = info->dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (info->parent) { domain->root = info->parent->root; domain->parent = info->parent; } #endif if (info->dgc_info) { err = irq_domain_alloc_generic_chips(domain, info->dgc_info); if (err) goto err_domain_free; } if (info->init) { err = info->init(domain); if (err) goto err_domain_gc_remove; } __irq_domain_publish(domain); if (cond_alloc_descs && info->virq_base > 0) irq_domain_instantiate_descs(info); /* * Legacy interrupt domains have a fixed Linux interrupt number * associated. Other interrupt domains can request association by * providing a Linux interrupt number > 0. */ if (force_associate || info->virq_base > 0) { irq_domain_associate_many(domain, info->virq_base, info->hwirq_base, info->size - info->hwirq_base); } return domain; err_domain_gc_remove: if (info->dgc_info) irq_domain_remove_generic_chips(domain); err_domain_free: irq_domain_free(domain); return ERR_PTR(err); } /** * irq_domain_instantiate() - Instantiate a new irq domain data structure * @info: Domain information pointer pointing to the information for this domain * * Return: A pointer to the instantiated irq domain or an ERR_PTR value. */ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) { return __irq_domain_instantiate(info, false, false); } EXPORT_SYMBOL_GPL(irq_domain_instantiate); /** * irq_domain_remove() - Remove an irq domain. * @domain: domain to remove * * This routine is used to remove an irq domain. The caller must ensure * that all mappings within the domain have been disposed of prior to * use, depending on the revmap type. */ void irq_domain_remove(struct irq_domain *domain) { if (domain->exit) domain->exit(domain); mutex_lock(&irq_domain_mutex); debugfs_remove_domain_dir(domain); WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); /* * If the going away domain is the default one, reset it. 
*/ if (unlikely(irq_default_domain == domain)) irq_set_default_domain(NULL); mutex_unlock(&irq_domain_mutex); if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC) irq_domain_remove_generic_chips(domain); pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); void irq_domain_update_bus_token(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { char *name; if (domain->bus_token == bus_token) return; mutex_lock(&irq_domain_mutex); domain->bus_token = bus_token; name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token); if (!name) { mutex_unlock(&irq_domain_mutex); return; } debugfs_remove_domain_dir(domain); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); else domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; domain->name = name; debugfs_add_domain_dir(domain); mutex_unlock(&irq_domain_mutex); } EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs * @fwnode: firmware node for the interrupt controller * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most * interrupt controllers. If device tree is used, then first_irq will be 0 and * irqs get mapped dynamically on the fly. However, if the controller requires * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { .fwnode = fwnode, .size = size, .hwirq_max = size, .virq_base = first_irq, .ops = ops, .host_data = host_data, }; struct irq_domain *domain = __irq_domain_instantiate(&info, true, false); return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_simple); struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { .fwnode = fwnode, .size = first_hwirq + size, .hwirq_max = first_hwirq + size, .hwirq_base = first_hwirq, .virq_base = first_irq, .ops = ops, .host_data = host_data, }; struct irq_domain *domain = __irq_domain_instantiate(&info, false, true); return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_legacy); /** * irq_find_matching_fwspec() - Locates a domain for a given fwspec * @fwspec: FW specifier for an interrupt * @bus_token: domain-specific data */ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; struct fwnode_handle *fwnode = fwspec->fwnode; int rc; /* * We might want to match the legacy controller last since * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... 
* * bus_token == DOMAIN_BUS_ANY matches any domain, any other * values must generate an exact match for the domain to be * selected. */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->select && bus_token != DOMAIN_BUS_ANY) rc = h->ops->select(h, fwspec, bus_token); else if (h->ops->match) rc = h->ops->match(h, to_of_node(fwnode), bus_token); else rc = ((fwnode != NULL) && (h->fwnode == fwnode) && ((bus_token == DOMAIN_BUS_ANY) || (h->bus_token == bus_token))); if (rc) { found = h; break; } } mutex_unlock(&irq_domain_mutex); return found; } EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** * irq_set_default_domain() - Set a "default" irq domain * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used * whenever NULL is passed to irq_create_mapping(). It makes life easier for * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ void irq_set_default_domain(struct irq_domain *domain) { pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } EXPORT_SYMBOL_GPL(irq_set_default_domain); /** * irq_get_default_domain() - Retrieve the "default" irq domain * * Returns: the default domain, if any. * * Modern code should never use this. This should only be used on * systems that cannot implement a firmware->fwnode mapping (which * both DT and ACPI provide). */ struct irq_domain *irq_get_default_domain(void) { return irq_default_domain; } EXPORT_SYMBOL_GPL(irq_get_default_domain); static bool irq_domain_is_nomap(struct irq_domain *domain) { return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP); } static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], NULL); else radix_tree_delete(&domain->revmap_tree, hwirq); } static void irq_domain_set_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_data *irq_data) { /* * This also makes sure that all domains point to the same root when * called from irq_domain_insert_irq() for each domain in a hierarchy. 
*/ lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], irq_data); else radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); } static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; if (WARN(!irq_data || irq_data->domain != domain, "virq%i doesn't exist; cannot disassociate\n", irq)) return; hwirq = irq_data->hwirq; mutex_lock(&domain->root->mutex); irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ irq_set_chip_and_handler(irq, NULL, NULL); /* Make sure it's completed */ synchronize_irq(irq); /* Tell the PIC about it */ if (domain->ops->unmap) domain->ops->unmap(domain, irq); smp_mb(); irq_data->domain = NULL; irq_data->hwirq = 0; domain->mapcount--; /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); mutex_unlock(&domain->root->mutex); } static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); int ret; if (WARN(hwirq >= domain->hwirq_max, "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) return -EINVAL; if (WARN(!irq_data, "error: virq%i is not allocated", virq)) return -EINVAL; if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) return -EINVAL; irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map) { ret = domain->ops->map(domain, virq, hwirq); if (ret != 0) { /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not * be mapped. Don't bother telling the user about it. */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; return ret; } } domain->mapcount++; irq_domain_set_mapping(domain, hwirq, irq_data); irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } int irq_domain_associate(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { int ret; mutex_lock(&domain->root->mutex); ret = irq_domain_associate_locked(domain, virq, hwirq); mutex_unlock(&domain->root->mutex); return ret; } EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { struct device_node *of_node; int i; of_node = irq_domain_get_of_node(domain); pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, of_node_full_name(of_node), irq_base, (int)hwirq_base, count); for (i = 0; i < count; i++) irq_domain_associate(domain, irq_base + i, hwirq_base + i); } EXPORT_SYMBOL_GPL(irq_domain_associate_many); #ifdef CONFIG_IRQ_DOMAIN_NOMAP /** * irq_create_direct_mapping() - Allocate an irq for direct mapping * @domain: domain to allocate the irq for or NULL for default domain * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use * the linux irq as the hardware interrupt number. It still uses the linear * or radix tree to store the mapping, but the irq controller can optimize * the revmap path by using the hwirq directly. 
*/ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { struct device_node *of_node; unsigned int virq; if (domain == NULL) domain = irq_default_domain; of_node = irq_domain_get_of_node(domain); virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } if (virq >= domain->hwirq_max) { pr_err("ERROR: no free irqs available below %lu maximum\n", domain->hwirq_max); irq_free_desc(virq); return 0; } pr_debug("create_direct obtained virq %d\n", virq); if (irq_domain_associate(domain, virq, virq)) { irq_free_desc(virq); return 0; } return virq; } EXPORT_SYMBOL_GPL(irq_create_direct_mapping); #endif static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity) { struct device_node *of_node = irq_domain_get_of_node(domain); int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); /* Allocate a virtual interrupt number */ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; } if (irq_domain_associate_locked(domain, virq, hwirq)) { irq_free_desc(virq); return 0; } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", hwirq, of_node_full_name(of_node), virq); return virq; } /** * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. 
*/ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity) { int virq; /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } mutex_lock(&domain->root->mutex); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("existing mapping on virq %d\n", virq); goto out; } virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); out: mutex_unlock(&domain->root->mutex); return virq; } EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); static int irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type) { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (d->ops->translate) return d->ops->translate(d, fwspec, hwirq, type); #endif if (d->ops->xlate) return d->ops->xlate(d, to_of_node(fwspec->fwnode), fwspec->param, fwspec->param_count, hwirq, type); /* If domain has no translation, then we assume interrupt line */ *hwirq = fwspec->param[0]; return 0; } void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec) { int i; fwspec->fwnode = of_fwnode_handle(np); fwspec->param_count = count; for (i = 0; i < count; i++) fwspec->param[i] = args[i]; } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; struct irq_data *irq_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); if (!domain) domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY); } else { domain = irq_default_domain; } if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); return 0; } if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; /* * WARN if the irqchip returns a type with bits * outside the sense mask set and clear these bits. */ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) type &= IRQ_TYPE_SENSE_MASK; mutex_lock(&domain->root->mutex); /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. */ virq = irq_find_mapping(domain, hwirq); if (virq) { /* * If the trigger type is not specified or matches the * current trigger type then we are done so return the * interrupt number. */ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) goto out; /* * If the trigger type has not been set yet, then set * it now and return the interrupt number. 
*/ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { irq_data = irq_get_irq_data(virq); if (!irq_data) { virq = 0; goto out; } irqd_set_trigger_type(irq_data, type); goto out; } pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); virq = 0; goto out; } if (irq_domain_is_hierarchy(domain)) { if (irq_domain_is_msi_device(domain)) { mutex_unlock(&domain->root->mutex); virq = msi_device_domain_alloc_wired(domain, hwirq, type); mutex_lock(&domain->root->mutex); } else virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, fwspec, false, NULL); if (virq <= 0) { virq = 0; goto out; } } else { /* Create mapping */ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL); if (!virq) goto out; } irq_data = irq_get_irq_data(virq); if (WARN_ON(!irq_data)) { virq = 0; goto out; } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); out: mutex_unlock(&domain->root->mutex); return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(irq_data->np, irq_data->args, irq_data->args_count, &fwspec); return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** * irq_dispose_mapping() - Unmap an interrupt * @virq: linux irq number of the interrupt to unmap */ void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data; struct irq_domain *domain; irq_data = virq ? irq_get_irq_data(virq) : NULL; if (!irq_data) return; domain = irq_data->domain; if (WARN_ON(domain == NULL)) return; if (irq_domain_is_hierarchy(domain)) { irq_domain_free_one_irq(domain, virq); } else { irq_domain_disassociate(domain, virq); irq_free_desc(virq); } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** * __irq_resolve_mapping() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space * @irq: optional pointer to return the Linux irq if required * * Returns the interrupt descriptor. */ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, unsigned int *irq) { struct irq_desc *desc = NULL; struct irq_data *data; /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) return desc; if (irq_domain_is_nomap(domain)) { if (hwirq < domain->hwirq_max) { data = irq_domain_get_irq_data(domain, hwirq); if (data && data->hwirq == hwirq) desc = irq_data_to_desc(data); if (irq && desc) *irq = hwirq; } return desc; } rcu_read_lock(); /* Check if the hwirq is in the linear revmap. 
*/ if (hwirq < domain->revmap_size) data = rcu_dereference(domain->revmap[hwirq]); else data = radix_tree_lookup(&domain->revmap_tree, hwirq); if (likely(data)) { desc = irq_data_to_desc(data); if (irq) *irq = data->irq; } rcu_read_unlock(); return desc; } EXPORT_SYMBOL_GPL(__irq_resolve_mapping); /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with one cell * bindings where the cell value maps directly to the hwirq number. */ int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); /** * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with two cell * bindings where the cell values map directly to the hwirq number * and linux irq flags. */ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); /** * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree interrupt specifier translation function for two or three * cell bindings, where the cell values map directly to the hardware * interrupt number and the type specifier. 
*/ int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); /** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with either one * or two cell bindings where the cell values map directly to the hwirq number * and linux irq flags. * * Note: don't use this function unless your interrupt controller explicitly * supports both one and two cell bindings. For the majority of controllers * the _onecell() or _twocell() variants above should be used. */ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; if (intsize > 1) *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; else *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); /** * irq_domain_translate_onecell() - Generic translate for direct one cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type */ int irq_domain_translate_onecell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(fwspec->param_count < 1)) return -EINVAL; *out_hwirq = fwspec->param[0]; *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_translate_onecell); /** * irq_domain_translate_twocell() - Generic translate for direct two cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with two cell * bindings where the cell values map directly to the hwirq number * and linux irq flags. 
*/ int irq_domain_translate_twocell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(fwspec->param_count < 2)) return -EINVAL; *out_hwirq = fwspec->param[0]; *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); /** * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Firmware interrupt specifier translation function for two or three cell * specifications, where the parameter values map directly to the hardware * interrupt number and the type specifier. */ int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (fwspec->param_count == 2) { *out_hwirq = fwspec->param[0]; *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } if (fwspec->param_count == 3) { *out_hwirq = fwspec->param[1]; *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { unsigned int hint; if (virq >= 0) { virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, affinity); } else { hint = hwirq % irq_get_nr_irqs(); if (hint == 0) hint++; virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, affinity); if (virq <= 0 && hint > 1) { virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, affinity); } } return virq; } /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data * @irq_data: The pointer to irq_data */ void irq_domain_reset_irq_data(struct irq_data *irq_data) { irq_data->hwirq = 0; irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY static void irq_domain_insert_irq(int virq) { struct irq_data *data; for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; domain->mapcount++; irq_domain_set_mapping(domain, data->hwirq, data); } irq_clear_status_flags(virq, IRQ_NOREQUEST); } static void irq_domain_remove_irq(int virq) { struct irq_data *data; irq_set_status_flags(virq, IRQ_NOREQUEST); irq_set_chip_and_handler(virq, NULL, NULL); synchronize_irq(virq); smp_mb(); for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; domain->mapcount--; irq_domain_clear_mapping(domain, hwirq); } } static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, struct irq_data *child) { struct irq_data *irq_data; irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, irq_data_get_node(child)); if (irq_data) { child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; irq_data->domain = domain; } return irq_data; } static void __irq_domain_free_hierarchy(struct irq_data *irq_data) { struct irq_data *tmp; while (irq_data) { tmp = irq_data; irq_data = irq_data->parent_data; kfree(tmp); } } static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data, *tmp; int i; for (i = 0; i < 
nr_irqs; i++) { irq_data = irq_get_irq_data(virq + i); tmp = irq_data->parent_data; irq_data->parent_data = NULL; irq_data->domain = NULL; __irq_domain_free_hierarchy(tmp); } } /** * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy * @domain: IRQ domain from which the hierarchy is to be disconnected * @virq: IRQ number where the hierarchy is to be trimmed * * Marks the @virq level belonging to @domain as disconnected. * Returns -EINVAL if @virq doesn't have a valid irq_data pointing * to @domain. * * Its only use is to be able to trim levels of hierarchy that do not * have any real meaning for this interrupt, and that the driver marks * as such from its .alloc() callback. */ int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq) { struct irq_data *irqd; irqd = irq_domain_get_irq_data(domain, virq); if (!irqd) return -EINVAL; irqd->chip = ERR_PTR(-ENOTCONN); return 0; } EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy); static int irq_domain_trim_hierarchy(unsigned int virq) { struct irq_data *tail, *irqd, *irq_data; irq_data = irq_get_irq_data(virq); tail = NULL; /* The first entry must have a valid irqchip */ if (IS_ERR_OR_NULL(irq_data->chip)) return -EINVAL; /* * Validate that the irq_data chain is sane in the presence of * a hierarchy trimming marker. */ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) { /* Can't have a valid irqchip after a trim marker */ if (irqd->chip && tail) return -EINVAL; /* Can't have an empty irqchip before a trim marker */ if (!irqd->chip && !tail) return -EINVAL; if (IS_ERR(irqd->chip)) { /* Only -ENOTCONN is a valid trim marker */ if (PTR_ERR(irqd->chip) != -ENOTCONN) return -EINVAL; tail = irq_data; } } /* No trim marker, nothing to do */ if (!tail) return 0; pr_info("IRQ%d: trimming hierarchy from %s\n", virq, tail->parent_data->domain->name); /* Sever the inner part of the hierarchy... 
*/ irqd = tail; tail = tail->parent_data; irqd->parent_data = NULL; __irq_domain_free_hierarchy(tail); return 0; } static int irq_domain_alloc_irq_data(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data; struct irq_domain *parent; int i; /* The outermost irq_data is embedded in struct irq_desc */ for (i = 0; i < nr_irqs; i++) { irq_data = irq_get_irq_data(virq + i); irq_data->domain = domain; for (parent = domain->parent; parent; parent = parent->parent) { irq_data = irq_domain_insert_irq_data(parent, irq_data); if (!irq_data) { irq_domain_free_irq_data(virq, i + 1); return -ENOMEM; } } } return 0; } /** * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data */ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq) { struct irq_data *irq_data; for (irq_data = irq_get_irq_data(virq); irq_data; irq_data = irq_data->parent_data) if (irq_data->domain == domain) return irq_data; return NULL; } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hwirq number * @chip: The associated interrupt chip * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); if (!irq_data) return -ENOENT; irq_data->hwirq = hwirq; irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; } EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hardware interrupt number * @chip: The associated interrupt chip * @chip_data: The associated interrupt chip data * @handler: The interrupt flow handler * @handler_data: The interrupt flow handler data * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match * @virq: IRQ number to start with * @nr_irqs: The number of irqs to free */ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data; int i; for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); if (irq_data) irq_domain_reset_irq_data(irq_data); } irq_domain_free_irqs_parent(domain, virq, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common); /** * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent * @domain: Interrupt domain to match * @virq: IRQ number to start with * @nr_irqs: The number of irqs to free */ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { int i; for (i = 0; i < nr_irqs; i++) { irq_set_handler_data(virq + i, NULL); 
irq_set_handler(virq + i, NULL); } irq_domain_free_irqs_common(domain, virq, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top); static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { unsigned int i; if (!domain->ops->free) return; for (i = 0; i < nr_irqs; i++) { if (irq_domain_get_irq_data(domain, irq_base + i)) domain->ops->free(domain, irq_base + i, 1); } } static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); return -ENOSYS; } return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; if (realloc && irq_base >= 0) { virq = irq_base; } else { virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); return virq; } } if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { pr_debug("cannot allocate memory for IRQ%d\n", virq); ret = -ENOMEM; goto out_free_desc; } ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); if (ret < 0) goto out_free_irq_data; for (i = 0; i < nr_irqs; i++) { ret = irq_domain_trim_hierarchy(virq + i); if (ret) goto out_free_irq_data; } for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); return virq; out_free_irq_data: irq_domain_free_irq_data(virq, nr_irqs); out_free_desc: irq_free_descs(virq, nr_irqs); return ret; } /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialize all data structures to support * hierarchy IRQ domains. * Parameter @realloc is mainly to support legacy IRQs. * Returns error code or allocated IRQ number * * The whole process of setting up an IRQ has been split into two steps. * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ * descriptor and required hardware resources. The second step, * irq_domain_activate_irq(), is to program the hardware with preallocated * resources. In this way, it's easier to roll back when failing to * allocate resources. */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { int ret; if (domain == NULL) { domain = irq_default_domain; if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) return -EINVAL; } mutex_lock(&domain->root->mutex); ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, realloc, affinity); mutex_unlock(&domain->root->mutex); return ret; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { void __rcu **slot; lockdep_assert_held(&d->domain->root->mutex); if (irq_domain_is_nomap(d->domain)) return; /* Fix up the revmap.
*/ if (d->hwirq < d->domain->revmap_size) { /* Not using radix tree */ rcu_assign_pointer(d->domain->revmap[d->hwirq], d); } else { slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); } } /** * irq_domain_push_irq() - Push a domain in to the top of a hierarchy. * @domain: Domain to push. * @virq: Irq to push the domain in to. * @arg: Passed to the irq_domain_ops alloc() function. * * For an already existing irqdomain hierarchy, as might be obtained * via a call to pci_enable_msix(), add an additional domain to the * head of the processing chain. Must be called before request_irq() * has been called. */ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_data *parent_irq_data; struct irq_desc *desc; int rv = 0; /* * Check that no action has been set, which indicates the virq * is in a state where this function doesn't have to deal with * races between interrupt handling and maintaining the * hierarchy. This will catch gross misuse. Attempting to * make the check race free would require holding locks across * calls to struct irq_domain_ops->alloc(), which could lead * to deadlock, so we just do a simple check before starting. */ desc = irq_to_desc(virq); if (!desc) return -EINVAL; if (WARN_ON(desc->action)) return -EBUSY; if (domain == NULL) return -EINVAL; if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; if (!irq_data) return -EINVAL; if (domain->parent != irq_data->domain) return -EINVAL; parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL, irq_data_get_node(irq_data)); if (!parent_irq_data) return -ENOMEM; mutex_lock(&domain->root->mutex); /* Copy the original irq_data. */ *parent_irq_data = *irq_data; /* * Overwrite the irq_data, which is embedded in struct irq_desc, with * values for this domain. */ irq_data->parent_data = parent_irq_data; irq_data->domain = domain; irq_data->mask = 0; irq_data->hwirq = 0; irq_data->chip = NULL; irq_data->chip_data = NULL; /* May (probably does) set hwirq, chip, etc. */ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (rv) { /* Restore the original irq_data. */ *irq_data = *parent_irq_data; kfree(parent_irq_data); goto error; } irq_domain_fix_revmap(parent_irq_data); irq_domain_set_mapping(domain, irq_data->hwirq, irq_data); error: mutex_unlock(&domain->root->mutex); return rv; } EXPORT_SYMBOL_GPL(irq_domain_push_irq); /** * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy. * @domain: Domain to remove. * @virq: Irq to remove the domain from. * * Undo the effects of a call to irq_domain_push_irq(). Must be * called either before request_irq() or after free_irq(). */ int irq_domain_pop_irq(struct irq_domain *domain, int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_data *parent_irq_data; struct irq_data *tmp_irq_data; struct irq_desc *desc; /* * Check that no action is set, which indicates the virq is in * a state where this function doesn't have to deal with races * between interrupt handling and maintaining the hierarchy. * This will catch gross misuse. Attempting to make the check * race free would require holding locks across calls to * struct irq_domain_ops->free(), which could lead to * deadlock, so we just do a simple check before starting. 
*/ desc = irq_to_desc(virq); if (!desc) return -EINVAL; if (WARN_ON(desc->action)) return -EBUSY; if (domain == NULL) return -EINVAL; if (!irq_data) return -EINVAL; tmp_irq_data = irq_domain_get_irq_data(domain, virq); /* We can only "pop" if this domain is at the top of the list */ if (WARN_ON(irq_data != tmp_irq_data)) return -EINVAL; if (WARN_ON(irq_data->domain != domain)) return -EINVAL; parent_irq_data = irq_data->parent_data; if (WARN_ON(!parent_irq_data)) return -EINVAL; mutex_lock(&domain->root->mutex); irq_data->parent_data = NULL; irq_domain_clear_mapping(domain, irq_data->hwirq); irq_domain_free_irqs_hierarchy(domain, virq, 1); /* Restore the original irq_data. */ *irq_data = *parent_irq_data; irq_domain_fix_revmap(irq_data); mutex_unlock(&domain->root->mutex); kfree(parent_irq_data); return 0; } EXPORT_SYMBOL_GPL(irq_domain_pop_irq); /** * irq_domain_free_irqs - Free IRQ number and associated data structures * @virq: base IRQ number * @nr_irqs: number of IRQs to free */ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain *domain; int i; if (WARN(!data || !data->domain || !data->domain->ops->free, "NULL pointer, cannot free irq\n")) return; domain = data->domain; mutex_lock(&domain->root->mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs); mutex_unlock(&domain->root->mutex); irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); } static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { if (irq_domain_is_msi_device(domain)) msi_device_domain_free_wired(domain, virq); else irq_domain_free_irqs(virq, 1); } /** * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain * @domain: Domain below which interrupts must be allocated * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to allocate * @arg: Allocation data (arch/domain specific) */ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { if (!domain->parent) return -ENOSYS; return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base, nr_irqs, arg); } EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain * @domain: Domain below which interrupts must be freed * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to free */ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { if (!domain->parent) return; irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { struct irq_domain *domain = irq_data->domain; if (domain->ops->deactivate) domain->ops->deactivate(domain, irq_data); if (irq_data->parent_data) __irq_domain_deactivate_irq(irq_data->parent_data); } } static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; if (irqd && irqd->domain) { struct irq_domain *domain = irqd->domain; if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, reserve); if (!ret && domain->ops->activate) { ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); } } return ret; } /** * irq_domain_activate_irq - Call domain_ops->activate 
recursively to activate * interrupt * @irq_data: Outermost irq_data associated with interrupt * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; } /** * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to * deactivate interrupt * @irq_data: outermost irq_data associated with interrupt * * It calls domain_ops->deactivate to program interrupt controllers to disable * interrupt delivery. */ void irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irqd_is_activated(irq_data)) { __irq_domain_deactivate_irq(irq_data); irqd_clr_activated(irq_data); } } static void irq_domain_check_hierarchy(struct irq_domain *domain) { /* Hierarchy irq_domains must implement callback alloc() */ if (domain->ops->alloc) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data */ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /* * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hardware interrupt number * @chip: The associated interrupt chip * @chip_data: The associated interrupt chip data * @handler: The interrupt flow handler * @handler_data: The interrupt flow handler data * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { irq_set_chip_and_handler_name(virq, chip, handler, handler_name); irq_set_chip_data(virq, chip_data); irq_set_handler_data(virq, handler_data); } static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { return -EINVAL; } static void irq_domain_check_hierarchy(struct irq_domain *domain) { } static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include "internals.h" static struct dentry *domain_dir; static const struct irq_bit_descr irqdomain_flags[] = { BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY), BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE), }; static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) { seq_printf(m, "%*sname: %s\n", ind, "", d->name); seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size); seq_printf(m, 
"%*smapped: %u\n", ind + 1, "", d->mapcount); seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags)); if (d->ops && d->ops->debug_show) d->ops->debug_show(m, d, NULL, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!d->parent) return; seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name); irq_domain_debug_show_one(m, d->parent, ind + 4); #endif } static int irq_domain_debug_show(struct seq_file *m, void *p) { struct irq_domain *d = m->private; /* Default domain? Might be NULL */ if (!d) { if (!irq_default_domain) return 0; d = irq_default_domain; } irq_domain_debug_show_one(m, d, 0); return 0; } DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir) return; debugfs_create_file(d->name, 0444, domain_dir, d, &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) { debugfs_lookup_and_remove(d->name, domain_dir); } void __init irq_domain_debugfs_init(struct dentry *root) { struct irq_domain *d; domain_dir = debugfs_create_dir("domains", root); debugfs_create_file("default", 0444, domain_dir, NULL, &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); mutex_unlock(&irq_domain_mutex); } #endif |
891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 | // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1Q Multiple Registration Protocol (MRP) * * Copyright (c) 2012 Massachusetts Institute of Technology * * Adapted from code in net/802/garp.c * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/module.h> #include <net/mrp.h> #include <linux/unaligned.h> static unsigned int mrp_join_time __read_mostly = 200; module_param(mrp_join_time, uint, 0644); MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); static unsigned int mrp_periodic_time __read_mostly = 1000; module_param(mrp_periodic_time, uint, 0644); MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)"); MODULE_DESCRIPTION("IEEE 802.1Q Multiple Registration Protocol (MRP)"); MODULE_LICENSE("GPL"); static const u8 mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = { [MRP_APPLICANT_VO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_IN] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO, }, [MRP_APPLICANT_VP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_AA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_IN] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP, }, [MRP_APPLICANT_VN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_AN, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN, }, [MRP_APPLICANT_AN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN, }, [MRP_APPLICANT_AA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA, 
[MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_QA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_LA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_LV] = MRP_APPLICANT_LA, [MRP_EVENT_R_LA] = MRP_APPLICANT_LA, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA, }, [MRP_APPLICANT_AO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_AO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO, }, [MRP_APPLICANT_QO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO, }, [MRP_APPLICANT_AP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, [MRP_APPLICANT_QP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QP, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, }; static const u8 mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = { [MRP_APPLICANT_VO] = 
MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL, [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV, [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL, }; static void mrp_attrvalue_inc(void *value, u8 len) { u8 *v = (u8 *)value; /* Add 1 to the last byte. If it becomes zero, * go to the previous byte and repeat. */ while (len > 0 && !++v[--len]) ; } static int mrp_attr_cmp(const struct mrp_attr *attr, const void *value, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->len != len) return attr->len - len; return memcmp(attr->value, value, len); } static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = app->mad.rb_node; struct mrp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->mad.rb_node; struct mrp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = MRP_APPLICANT_VO; attr->type = type; attr->len = len; memcpy(attr->value, value, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->mad); return attr; } static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) { rb_erase(&attr->node, &app->mad); kfree(attr); } static void mrp_attr_destroy_all(struct mrp_applicant *app) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? 
rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_destroy(app, attr); } } static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; struct mrp_pdu_hdr *ph; skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = app->app->pkttype.type; skb_reserve(skb, LL_RESERVED_SPACE(app->dev)); skb_reset_network_header(skb); skb_reset_transport_header(skb); ph = __skb_put(skb, sizeof(*ph)); ph->version = app->app->version; app->pdu = skb; return 0; } static int mrp_pdu_append_end_mark(struct mrp_applicant *app) { __be16 *endmark; if (skb_tailroom(app->pdu) < sizeof(*endmark)) return -1; endmark = __skb_put(app->pdu, sizeof(*endmark)); put_unaligned(MRP_END_MARK, endmark); return 0; } static void mrp_pdu_queue(struct mrp_applicant *app) { if (!app->pdu) return; if (mrp_cb(app->pdu)->mh) mrp_pdu_append_end_mark(app); mrp_pdu_append_end_mark(app); dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type), app->app->group_address, app->dev->dev_addr, app->pdu->len); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void mrp_queue_xmit(struct mrp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app, u8 attrtype, u8 attrlen) { struct mrp_msg_hdr *mh; if (mrp_cb(app->pdu)->mh) { if (mrp_pdu_append_end_mark(app) < 0) return -1; mrp_cb(app->pdu)->mh = NULL; mrp_cb(app->pdu)->vah = NULL; } if (skb_tailroom(app->pdu) < sizeof(*mh)) return -1; mh = __skb_put(app->pdu, sizeof(*mh)); mh->attrtype = attrtype; mh->attrlen = attrlen; mrp_cb(app->pdu)->mh = mh; return 0; } static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app, const void *firstattrvalue, u8 attrlen) { struct mrp_vecattr_hdr *vah; if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen) return -1; vah = __skb_put(app->pdu, sizeof(*vah) + attrlen); put_unaligned(0, &vah->lenflags); memcpy(vah->firstattrvalue, firstattrvalue, attrlen); mrp_cb(app->pdu)->vah = vah; memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen); return 0; } static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app, const struct mrp_attr *attr, enum mrp_vecattr_event vaevent) { u16 len, pos; u8 *vaevents; int err; again: if (!app->pdu) { err = mrp_pdu_init(app); if (err < 0) return err; } /* If there is no Message header in the PDU, or the Message header is * for a different attribute type, add an EndMark (if necessary) and a * new Message header to the PDU. */ if (!mrp_cb(app->pdu)->mh || mrp_cb(app->pdu)->mh->attrtype != attr->type || mrp_cb(app->pdu)->mh->attrlen != attr->len) { if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0) goto queue; } /* If there is no VectorAttribute header for this Message in the PDU, * or this attribute's value does not sequentially follow the previous * attribute's value, add a new VectorAttribute header to the PDU. */ if (!mrp_cb(app->pdu)->vah || memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) { if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0) goto queue; } len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags)); pos = len % 3; /* Events are packed into Vectors in the PDU, three to a byte. Add a * byte to the end of the Vector if necessary. 
*/ if (!pos) { if (skb_tailroom(app->pdu) < sizeof(u8)) goto queue; vaevents = __skb_put(app->pdu, sizeof(u8)); } else { vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8)); } switch (pos) { case 0: *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); break; case 1: *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX; break; case 2: *vaevents += vaevent; break; default: WARN_ON(1); } /* Increment the length of the VectorAttribute in the PDU, as well as * the value of the next attribute that would continue its Vector. */ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags); mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len); return 0; queue: mrp_pdu_queue(app); goto again; } static void mrp_attr_event(struct mrp_applicant *app, struct mrp_attr *attr, enum mrp_event event) { enum mrp_applicant_state state; state = mrp_applicant_state_table[attr->state][event]; if (state == MRP_APPLICANT_INVALID) { WARN_ON(1); return; } if (event == MRP_EVENT_TX) { /* When appending the attribute fails, don't update its state * in order to retry at the next TX event. */ switch (mrp_tx_action_table[attr->state]) { case MRP_TX_ACTION_NONE: case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL: case MRP_TX_ACTION_S_IN_OPTIONAL: break; case MRP_TX_ACTION_S_NEW: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_NEW) < 0) return; break; case MRP_TX_ACTION_S_JOIN_IN: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0) return; break; case MRP_TX_ACTION_S_LV: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_LV) < 0) return; /* As a pure applicant, sending a leave message * implies that the attribute was unregistered and * can be destroyed. */ mrp_attr_destroy(app, attr); return; default: WARN_ON(1); } } attr->state = state; } int mrp_request_join(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return -ENOMEM; spin_lock_bh(&app->lock); attr = mrp_attr_create(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return -ENOMEM; } mrp_attr_event(app, attr, MRP_EVENT_JOIN); spin_unlock_bh(&app->lock); return 0; } EXPORT_SYMBOL_GPL(mrp_request_join); void mrp_request_leave(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return; spin_lock_bh(&app->lock); attr = mrp_attr_lookup(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return; } mrp_attr_event(app, attr, MRP_EVENT_LV); spin_unlock_bh(&app->lock); } EXPORT_SYMBOL_GPL(mrp_request_leave); static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? 
rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_event(app, attr, event); } } static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } static void mrp_join_timer(struct timer_list *t) { struct mrp_applicant *app = timer_container_of(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); spin_unlock(&app->lock); mrp_queue_xmit(app); spin_lock(&app->lock); if (likely(app->active)) mrp_join_timer_arm(app); spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) { mod_timer(&app->periodic_timer, jiffies + msecs_to_jiffies(mrp_periodic_time)); } static void mrp_periodic_timer(struct timer_list *t) { struct mrp_applicant *app = timer_container_of(app, t, periodic_timer); spin_lock(&app->lock); if (likely(app->active)) { mrp_mad_event(app, MRP_EVENT_PERIODIC); mrp_pdu_queue(app); mrp_periodic_timer_arm(app); } spin_unlock(&app->lock); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) { __be16 endmark; if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0) return -1; if (endmark == MRP_END_MARK) { *offset += sizeof(endmark); return -1; } return 0; } static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app, struct sk_buff *skb, enum mrp_vecattr_event vaevent) { struct mrp_attr *attr; enum mrp_event event; attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen, mrp_cb(skb)->mh->attrtype); if (attr == NULL) return; switch (vaevent) { case MRP_VECATTR_EVENT_NEW: event = MRP_EVENT_R_NEW; break; case MRP_VECATTR_EVENT_JOIN_IN: event = MRP_EVENT_R_JOIN_IN; break; case MRP_VECATTR_EVENT_IN: event = MRP_EVENT_R_IN; break; case MRP_VECATTR_EVENT_JOIN_MT: event = MRP_EVENT_R_JOIN_MT; break; case MRP_VECATTR_EVENT_MT: event = MRP_EVENT_R_MT; break; case MRP_VECATTR_EVENT_LV: event = MRP_EVENT_R_LV; break; default: return; } mrp_attr_event(app, attr, event); } static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_vecattr_hdr _vah; u16 valen; u8 vaevents, vaevent; mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah), &_vah); if (!mrp_cb(skb)->vah) return -1; *offset += sizeof(_vah); if (get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_FLAG_LA) mrp_mad_event(app, MRP_EVENT_R_LA); valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_LEN_MASK); /* The VectorAttribute structure in a PDU carries event information * about one or more attributes having consecutive values. Only the * value for the first attribute is contained in the structure. So * we make a copy of that value, and then increment it each time we * advance to the next event in its Vector. */ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > sizeof_field(struct sk_buff, cb)) return -1; if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen) < 0) return -1; *offset += mrp_cb(skb)->mh->attrlen; /* In a VectorAttribute, the Vector contains events which are packed * three to a byte. We process one byte of the Vector at a time. */ while (valen > 0) { if (skb_copy_bits(skb, *offset, &vaevents, sizeof(vaevents)) < 0) return -1; *offset += sizeof(vaevents); /* Extract and process the first event. 
*/ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); if (vaevent >= __MRP_VECATTR_EVENT_MAX) { /* The byte is malformed; stop processing. */ return -1; } mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the second event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); vaevent = vaevents / __MRP_VECATTR_EVENT_MAX; mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the third event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= __MRP_VECATTR_EVENT_MAX; vaevent = vaevents; mrp_pdu_parse_vecattr_event(app, skb, vaevent); } return 0; } static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_msg_hdr _mh; mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh); if (!mrp_cb(skb)->mh) return -1; *offset += sizeof(_mh); if (mrp_cb(skb)->mh->attrtype == 0 || mrp_cb(skb)->mh->attrtype > app->app->maxattr || mrp_cb(skb)->mh->attrlen == 0) return -1; while (skb->len > *offset) { if (mrp_pdu_parse_end_mark(skb, offset) < 0) break; if (mrp_pdu_parse_vecattr(app, skb, offset) < 0) return -1; } return 0; } static int mrp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct mrp_application *appl = container_of(pt, struct mrp_application, pkttype); struct mrp_port *port; struct mrp_applicant *app; struct mrp_pdu_hdr _ph; const struct mrp_pdu_hdr *ph; int offset = skb_network_offset(skb); /* If the interface is in promiscuous mode, drop the packet if * it was unicast to another host. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; port = rcu_dereference(dev->mrp_port); if (unlikely(!port)) goto out; app = rcu_dereference(port->applicants[appl->type]); if (unlikely(!app)) goto out; ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph); if (!ph) goto out; offset += sizeof(_ph); if (ph->version != app->app->version) goto out; spin_lock(&app->lock); while (skb->len > offset) { if (mrp_pdu_parse_end_mark(skb, &offset) < 0) break; if (mrp_pdu_parse_msg(app, skb, &offset) < 0) break; } spin_unlock(&app->lock); out: kfree_skb(skb); return 0; } static int mrp_init_port(struct net_device *dev) { struct mrp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->mrp_port, port); return 0; } static void mrp_release_port(struct net_device *dev) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); unsigned int i; for (i = 0; i <= MRP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->mrp_port, NULL); kfree_rcu(port, rcu); } int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->mrp_port)) { err = mrp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->mad = RB_ROOT; app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); timer_setup(&app->join_timer, mrp_join_timer, 0); mrp_join_timer_arm(app); 
timer_setup(&app->periodic_timer, mrp_periodic_timer, 0); mrp_periodic_timer_arm(app); return 0; err3: kfree(app); err2: mrp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(mrp_init_applicant); void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); spin_lock_bh(&app->lock); app->active = false; spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); timer_shutdown_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); mrp_queue_xmit(app); dev_mc_del(dev, appl->group_address); kfree_rcu(app, rcu); mrp_release_port(dev); } EXPORT_SYMBOL_GPL(mrp_uninit_applicant); int mrp_register_application(struct mrp_application *appl) { appl->pkttype.func = mrp_rcv; dev_add_pack(&appl->pkttype); return 0; } EXPORT_SYMBOL_GPL(mrp_register_application); void mrp_unregister_application(struct mrp_application *appl) { dev_remove_pack(&appl->pkttype); } EXPORT_SYMBOL_GPL(mrp_unregister_application); |
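/*
 * Usage sketch (not part of mrp.c), loosely following how MVRP in
 * net/8021q/vlan_mvrp.c drives this API.  The application below and its
 * attribute type are illustrative placeholders; only the helper calls and
 * the locking (RTNL held for the per-port operations) reflect the real
 * interface.
 */
static struct mrp_application example_mrp_app __read_mostly = {
	.type		= MRP_APPLICATION_MVRP,		/* placeholder */
	.maxattr	= 1,				/* highest attribute type used */
	.pkttype.type	= htons(ETH_P_MVRP),		/* placeholder EtherType */
	.group_address	= { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 },
	.version	= 0,
};

/* Module init: hook up the PDU receive handler (mrp_rcv) for this app. */
static int __init example_mrp_module_init(void)
{
	return mrp_register_application(&example_mrp_app);
}

/*
 * Per port, under RTNL: create the applicant, then declare an attribute.
 * mrp_request_join() only updates the MAD state machine; the join timer
 * armed by mrp_init_applicant() takes care of building and sending PDUs.
 */
static int example_mrp_port_join(struct net_device *dev, __be16 value)
{
	int err;

	ASSERT_RTNL();

	err = mrp_init_applicant(dev, &example_mrp_app);
	if (err < 0)
		return err;

	err = mrp_request_join(dev, &example_mrp_app, &value, sizeof(value), 1);
	if (err < 0)
		mrp_uninit_applicant(dev, &example_mrp_app);
	return err;
}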
| 9 9 8 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 | // SPDX-License-Identifier: GPL-2.0 /* * Portions * Copyright (C) 2022-2024 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "rdev-ops.h" /* Default values, timeouts in ms */ #define MESH_TTL 31 #define MESH_DEFAULT_ELEMENT_TTL 31 #define MESH_MAX_RETR 3 #define MESH_RET_T 100 #define MESH_CONF_T 100 #define MESH_HOLD_T 100 #define MESH_PATH_TIMEOUT 5000 #define MESH_RANN_INTERVAL 5000 #define MESH_PATH_TO_ROOT_TIMEOUT 6000 #define MESH_ROOT_INTERVAL 5000 #define MESH_ROOT_CONFIRMATION_INTERVAL 2000 #define MESH_DEFAULT_PLINK_TIMEOUT 1800 /* timeout in seconds */ /* * Minimum interval between two consecutive PREQs originated by the same * interface */ #define MESH_PREQ_MIN_INT 10 #define MESH_PERR_MIN_INT 100 #define MESH_DIAM_TRAVERSAL_TIME 50 #define MESH_RSSI_THRESHOLD 0 /* * A path will be refreshed if it is used PATH_REFRESH_TIME milliseconds * before timing out. This way it will remain ACTIVE and no data frames * will be unnecessarily held in the pending queue. 
*/ #define MESH_PATH_REFRESH_TIME 1000 #define MESH_MIN_DISCOVERY_TIMEOUT (2 * MESH_DIAM_TRAVERSAL_TIME) /* Default maximum number of established plinks per interface */ #define MESH_MAX_ESTAB_PLINKS 32 #define MESH_MAX_PREQ_RETRIES 4 #define MESH_SYNC_NEIGHBOR_OFFSET_MAX 50 #define MESH_DEFAULT_BEACON_INTERVAL 1000 /* in 1024 us units (=TUs) */ #define MESH_DEFAULT_DTIM_PERIOD 2 #define MESH_DEFAULT_AWAKE_WINDOW 10 /* in 1024 us units (=TUs) */ const struct mesh_config default_mesh_config = { .dot11MeshRetryTimeout = MESH_RET_T, .dot11MeshConfirmTimeout = MESH_CONF_T, .dot11MeshHoldingTimeout = MESH_HOLD_T, .dot11MeshMaxRetries = MESH_MAX_RETR, .dot11MeshTTL = MESH_TTL, .element_ttl = MESH_DEFAULT_ELEMENT_TTL, .auto_open_plinks = true, .dot11MeshMaxPeerLinks = MESH_MAX_ESTAB_PLINKS, .dot11MeshNbrOffsetMaxNeighbor = MESH_SYNC_NEIGHBOR_OFFSET_MAX, .dot11MeshHWMPactivePathTimeout = MESH_PATH_TIMEOUT, .dot11MeshHWMPpreqMinInterval = MESH_PREQ_MIN_INT, .dot11MeshHWMPperrMinInterval = MESH_PERR_MIN_INT, .dot11MeshHWMPnetDiameterTraversalTime = MESH_DIAM_TRAVERSAL_TIME, .dot11MeshHWMPmaxPREQretries = MESH_MAX_PREQ_RETRIES, .path_refresh_time = MESH_PATH_REFRESH_TIME, .min_discovery_timeout = MESH_MIN_DISCOVERY_TIMEOUT, .dot11MeshHWMPRannInterval = MESH_RANN_INTERVAL, .dot11MeshGateAnnouncementProtocol = false, .dot11MeshForwarding = true, .rssi_threshold = MESH_RSSI_THRESHOLD, .ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED, .dot11MeshHWMPactivePathToRootTimeout = MESH_PATH_TO_ROOT_TIMEOUT, .dot11MeshHWMProotInterval = MESH_ROOT_INTERVAL, .dot11MeshHWMPconfirmationInterval = MESH_ROOT_CONFIRMATION_INTERVAL, .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, .dot11MeshNolearn = false, }; const struct mesh_setup default_mesh_setup = { /* cfg80211_join_mesh() will pick a channel if needed */ .sync_method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET, .path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP, .path_metric = IEEE80211_PATH_METRIC_AIRTIME, .auth_id = 0, /* open */ .ie = NULL, .ie_len = 0, .is_secure = false, .user_mpm = false, .beacon_interval = MESH_DEFAULT_BEACON_INTERVAL, .dtim_period = MESH_DEFAULT_DTIM_PERIOD, }; int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN); lockdep_assert_wiphy(wdev->wiphy); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; if (!(rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) && setup->is_secure) return -EOPNOTSUPP; if (wdev->u.mesh.id_len) return -EALREADY; if (!setup->mesh_id_len) return -EINVAL; if (!rdev->ops->join_mesh) return -EOPNOTSUPP; if (wdev->links[0].cac_started) return -EBUSY; if (!setup->chandef.chan) { /* if no channel explicitly given, use preset channel */ setup->chandef = wdev->u.mesh.preset_chandef; } if (!setup->chandef.chan) { /* if we don't have that either, use the first usable channel */ enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; int i; sband = rdev->wiphy.bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { chan = &sband->channels[i]; if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_RADAR)) continue; setup->chandef.chan = chan; break; } if 
(setup->chandef.chan) break; } /* no usable channel ... */ if (!setup->chandef.chan) return -EINVAL; setup->chandef.width = NL80211_CHAN_WIDTH_20_NOHT; setup->chandef.center_freq1 = setup->chandef.chan->center_freq; } /* * check if basic rates are available otherwise use mandatory rates as * basic rates */ if (!setup->basic_rates) { struct ieee80211_supported_band *sband = rdev->wiphy.bands[setup->chandef.chan->band]; if (setup->chandef.chan->band == NL80211_BAND_2GHZ) { int i; /* * Older versions selected the mandatory rates for * 2.4 GHz as well, but were broken in that only * 1 Mbps was regarded as a mandatory rate. Keep * using just 1 Mbps as the default basic rate for * mesh to be interoperable with older versions. */ for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 10) { setup->basic_rates = BIT(i); break; } } } else { setup->basic_rates = ieee80211_mandatory_rates(sband); } } err = cfg80211_chandef_dfs_required(&rdev->wiphy, &setup->chandef, NL80211_IFTYPE_MESH_POINT); if (err < 0) return err; if (err > 0 && !setup->userspace_handles_dfs) return -EINVAL; if (!cfg80211_reg_can_beacon(&rdev->wiphy, &setup->chandef, NL80211_IFTYPE_MESH_POINT)) return -EINVAL; err = rdev_join_mesh(rdev, dev, conf, setup); if (!err) { memcpy(wdev->u.mesh.id, setup->mesh_id, setup->mesh_id_len); wdev->u.mesh.id_len = setup->mesh_id_len; wdev->u.mesh.chandef = setup->chandef; wdev->u.mesh.beacon_interval = setup->beacon_interval; } return err; } int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) { int err; /* * Workaround for libertas (only!), it puts the interface * into mesh mode but doesn't implement join_mesh. Instead, * it is configured via sysfs and then joins the mesh when * you set the channel. Note that the libertas mesh isn't * compatible with 802.11 mesh. */ if (rdev->ops->libertas_set_mesh_channel) { if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return -EINVAL; if (!netif_running(wdev->netdev)) return -ENETDOWN; err = rdev_libertas_set_mesh_channel(rdev, wdev->netdev, chandef->chan); if (!err) wdev->u.mesh.chandef = *chandef; return err; } if (wdev->u.mesh.id_len) return -EBUSY; wdev->u.mesh.preset_chandef = *chandef; return 0; } int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; if (!rdev->ops->leave_mesh) return -EOPNOTSUPP; if (!wdev->u.mesh.id_len) return -ENOTCONN; err = rdev_leave_mesh(rdev, dev); if (!err) { wdev->conn_owner_nlportid = 0; wdev->u.mesh.id_len = 0; wdev->u.mesh.beacon_interval = 0; memset(&wdev->u.mesh.chandef, 0, sizeof(wdev->u.mesh.chandef)); rdev_set_qos_map(rdev, dev, NULL); cfg80211_sched_dfs_chan_update(rdev); } return err; } |
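/*
 * Driver-side sketch (not part of this file): the callbacks that
 * rdev_join_mesh() and rdev_leave_mesh() above ultimately dispatch to.
 * The "exdrv_" names are hypothetical; a real driver programs its hardware
 * from setup->chandef, setup->mesh_id/mesh_id_len and the mesh_config
 * parameters, all of which cfg80211 has validated and defaulted by the
 * time these run.
 */
static int exdrv_join_mesh(struct wiphy *wiphy, struct net_device *dev,
			   const struct mesh_config *conf,
			   const struct mesh_setup *setup)
{
	/* ... configure channel (setup->chandef), mesh ID and HWMP timers ... */
	return 0;
}

static int exdrv_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
{
	/* ... tear down the MBSS on the device ... */
	return 0;
}

static const struct cfg80211_ops exdrv_cfg_ops = {
	.join_mesh	= exdrv_join_mesh,
	.leave_mesh	= exdrv_leave_mesh,
	/* ... remaining cfg80211 callbacks elided ... */
};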
| 6 6 2 2 2 2 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2007 Andi Kleen, SUSE Labs. * * This contains most of the x86 vDSO kernel-side code. */ #include <linux/mm.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/random.h> #include <linux/elf.h> #include <linux/cpu.h> #include <linux/ptrace.h> #include <linux/vdso_datastore.h> #include <asm/pvclock.h> #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> #include <asm/tlb.h> #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> #include <asm/vdso/vsyscall.h> #include <clocksource/hyperv_timer.h> static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES); unsigned int vclocks_used __read_mostly; #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; #endif int __init init_vdso_image(const struct vdso_image *image) { BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32); BUG_ON(image->size % PAGE_SIZE != 0); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + image->alt_len)); return 0; } struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) return VM_FAULT_SIGBUS; vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); get_page(vmf->page); return 0; } static void vdso_fix_landing(const struct vdso_image *image, struct vm_area_struct *new_vma) { if (in_ia32_syscall() && image == &vdso_image_32) { struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; unsigned long old_land_addr = vdso_land + (unsigned long)current->mm->context.vdso; /* Fixing userspace landing - look at do_fast_syscall_32 */ if (regs->ip == old_land_addr) regs->ip = new_vma->vm_start + vdso_land; } } static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; } static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { switch (vmf->pgoff) { #ifdef CONFIG_PARAVIRT_CLOCK case VDSO_PAGE_PVCLOCK_OFFSET: { struct 
pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) return vmf_insert_pfn_prot(vma, vmf->address, __pa(pvti) >> PAGE_SHIFT, pgprot_decrypted(vma->vm_page_prot)); break; } #endif /* CONFIG_PARAVIRT_CLOCK */ #ifdef CONFIG_HYPERV_TIMER case VDSO_PAGE_HVCLOCK_OFFSET: { unsigned long pfn = hv_get_tsc_pfn(); if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, pfn); break; } #endif /* CONFIG_HYPERV_TIMER */ } return VM_FAULT_SIGBUS; } static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, .mremap = vdso_mremap, }; static const struct vm_special_mapping vvar_vclock_mapping = { .name = "[vvar_vclock]", .fault = vvar_vclock_fault, }; /* * Add vdso and vvar mappings to current process. * @image - blob to map * @addr - request a specific address (zero to map at free addr) */ static int map_vdso(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long text_start; int ret = 0; if (mmap_write_lock_killable(mm)) return -EINTR; addr = get_unmapped_area(NULL, addr, image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } text_start = addr + __VDSO_PAGES * PAGE_SIZE; /* * MAYWRITE to allow gdb to COW and set breakpoints */ vma = _install_special_mapping(mm, text_start, image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_SEALED_SYSMAP, &vdso_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto up_fail; } vma = vdso_install_vvar_mapping(mm, addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); goto up_fail; } vma = _install_special_mapping(mm, VDSO_VCLOCK_PAGES_START(addr), VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| VM_PFNMAP|VM_SEALED_SYSMAP, &vvar_vclock_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); do_munmap(mm, addr, image->size, NULL); goto up_fail; } current->mm->context.vdso = (void __user *)text_start; current->mm->context.vdso_image = image; up_fail: mmap_write_unlock(mm); return ret; } int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); /* * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_special_mapping, which may * not do accounting and rlimit right. * We could search vma near context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. 
*/ for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vdso_vvar_mapping) || vma_is_special_mapping(vma, &vvar_vclock_mapping)) { mmap_write_unlock(mm); return -EEXIST; } } mmap_write_unlock(mm); return map_vdso(image, addr); } static int load_vdso32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; return map_vdso(&vdso_image_32, 0); } int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { if (IS_ENABLED(CONFIG_X86_64)) { if (!vdso64_enabled) return 0; return map_vdso(&vdso_image_64, 0); } return load_vdso32(); } #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, bool x32) { if (IS_ENABLED(CONFIG_X86_X32_ABI) && x32) { if (!vdso64_enabled) return 0; return map_vdso(&vdso_image_x32, 0); } if (IS_ENABLED(CONFIG_IA32_EMULATION)) return load_vdso32(); return 0; } #endif bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) { const struct vdso_image *image = current->mm->context.vdso_image; unsigned long vdso = (unsigned long) current->mm->context.vdso; if (in_ia32_syscall() && image == &vdso_image_32) { if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad || regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) return true; } return false; } #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { vdso64_enabled = simple_strtoul(s, NULL, 0); return 1; } __setup("vdso=", vdso_setup); #endif /* CONFIG_X86_64 */ |
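/*
 * Userspace-side sketch (not part of this file): the [vdso] mapping
 * installed by map_vdso() above is advertised to each new process via the
 * AT_SYSINFO_EHDR auxiliary-vector entry, so a program can locate the vDSO
 * ELF image like this.  Purely illustrative; C libraries normally resolve
 * vDSO symbols (clock_gettime() etc.) behind the scenes.
 */
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* Zero when no vDSO is mapped, e.g. after booting with vdso=0. */
	unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

	if (!vdso)
		printf("no vDSO mapped\n");
	else
		printf("[vdso] ELF header at %#lx\n", vdso);
	return 0;
}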
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CRYPTO_XTS_H
#define _CRYPTO_XTS_H

#include <crypto/b128ops.h>
#include <crypto/internal/skcipher.h>
#include <linux/fips.h>

#define XTS_BLOCK_SIZE 16

static inline int xts_verify_key(struct crypto_skcipher *tfm,
				 const u8 *key, unsigned int keylen)
{
	/*
	 * key consists of keys of equal size concatenated, therefore
	 * the length must be even.
	 */
	if (keylen % 2)
		return -EINVAL;

	/*
	 * In FIPS mode only a combined key length of either 256 or
	 * 512 bits is allowed, c.f. FIPS 140-3 IG C.I.
	 */
	if (fips_enabled && keylen != 32 && keylen != 64)
		return -EINVAL;

	/*
	 * Ensure that the AES and tweak key are not identical when
	 * in FIPS mode or the FORBID_WEAK_KEYS flag is set.
	 */
	if ((fips_enabled || (crypto_skcipher_get_flags(tfm) &
			      CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) &&
	    !crypto_memneq(key, key + (keylen / 2), keylen / 2))
		return -EINVAL;

	return 0;
}

#endif /* _CRYPTO_XTS_H */
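/*
 * Usage sketch (not part of this header): a typical XTS setkey()
 * implementation calls xts_verify_key() before expanding the two half keys.
 * The function name and elided context handling are hypothetical.
 */
static int example_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
			      unsigned int keylen)
{
	int err;

	/* Rejects odd lengths, FIPS-invalid sizes and identical half keys. */
	err = xts_verify_key(tfm, key, keylen);
	if (err)
		return err;

	/*
	 * First half of the key encrypts the data, second half encrypts the
	 * tweak: expand key[0 .. keylen/2) and key[keylen/2 .. keylen) into
	 * the transform context here.
	 */
	return 0;
}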
// SPDX-License-Identifier: GPL-2.0-only /* * vivid-kthread-cap.c - video/vbi capture thread support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/font.h> #include <linux/mutex.h> #include <linux/videodev2.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/random.h> #include <linux/v4l2-dv-timings.h> #include <linux/jiffies.h> #include <asm/div64.h> #include <media/videobuf2-vmalloc.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-fh.h> #include <media/v4l2-event.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-vid-common.h" #include "vivid-vid-cap.h" #include "vivid-vid-out.h" #include "vivid-radio-common.h" #include "vivid-radio-rx.h" #include "vivid-radio-tx.h" #include "vivid-sdr-cap.h" #include "vivid-vbi-cap.h" #include "vivid-vbi-out.h" #include "vivid-osd.h" #include "vivid-ctrls.h" #include "vivid-kthread-cap.h" #include "vivid-meta-cap.h" static inline v4l2_std_id vivid_get_std_cap(const struct vivid_dev *dev) { if (vivid_is_sdtv_cap(dev)) return dev->std_cap[dev->input]; return 0; } static void copy_pix(struct vivid_dev *dev, int win_y, int win_x, u16 *cap, const u16 *osd) { u16 out; out = *cap; *cap = *osd; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_CHROMAKEY) && *osd != dev->chromakey_out) return; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_SRC_CHROMAKEY) && out == dev->chromakey_out) return; if (dev->fmt_cap->alpha_mask) { if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_GLOBAL_ALPHA) && dev->global_alpha_out) return; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_LOCAL_ALPHA) && *cap & dev->fmt_cap->alpha_mask) return; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_LOCAL_INV_ALPHA) && !(*cap & dev->fmt_cap->alpha_mask)) return; } *cap = out; } static void blend_line(struct vivid_dev *dev, unsigned y_offset, unsigned x_offset, u8 *vcapbuf, const u8 *vosdbuf, unsigned width, unsigned pixsize) { unsigned x; for (x = 0; x < width; x++, vcapbuf += pixsize, vosdbuf += pixsize) { copy_pix(dev, y_offset, x_offset + x, (u16 *)vcapbuf, (const u16 *)vosdbuf); } } static void scale_line(const u8 *src, u8 *dst, unsigned srcw, unsigned dstw, unsigned twopixsize) { /* Coarse scaling with Bresenham */ unsigned int_part; unsigned fract_part; unsigned src_x = 0; unsigned error = 0; unsigned x; /* * We always combine two pixels to prevent color bleed in the packed * yuv case. */ srcw /= 2; dstw /= 2; int_part = srcw / dstw; fract_part = srcw % dstw; for (x = 0; x < dstw; x++, dst += twopixsize) { memcpy(dst, src + src_x * twopixsize, twopixsize); src_x += int_part; error += fract_part; if (error >= dstw) { error -= dstw; src_x++; } } } /* * Precalculate the rectangles needed to perform video looping: * * The nominal pipeline is that the video output buffer is cropped by * crop_out, scaled to compose_out, overlaid with the output overlay, * cropped on the capture side by crop_cap and scaled again to the video * capture buffer using compose_cap.
* * To keep things efficient we calculate the intersection of compose_out * and crop_cap (since that's the only part of the video that will * actually end up in the capture buffer), determine which part of the * video output buffer that is and which part of the video capture buffer * so we can scale the video straight from the output buffer to the capture * buffer without any intermediate steps. * * If we need to deal with an output overlay, then there is no choice and * that intermediate step still has to be taken. For the output overlay * support we calculate the intersection of the framebuffer and the overlay * window (which may be partially or wholly outside of the framebuffer * itself) and the intersection of that with loop_vid_copy (i.e. the part of * the actual looped video that will be overlaid). The result is calculated * both in framebuffer coordinates (loop_fb_copy) and compose_out coordinates * (loop_vid_overlay). Finally calculate the part of the capture buffer that * will receive that overlaid video. */ static void vivid_precalc_copy_rects(struct vivid_dev *dev, struct vivid_dev *out_dev) { /* Framebuffer rectangle */ struct v4l2_rect r_fb = { 0, 0, dev->display_width, dev->display_height }; /* Overlay window rectangle in framebuffer coordinates */ struct v4l2_rect r_overlay = { out_dev->overlay_out_left, out_dev->overlay_out_top, out_dev->compose_out.width, out_dev->compose_out.height }; v4l2_rect_intersect(&dev->loop_vid_copy, &dev->crop_cap, &out_dev->compose_out); dev->loop_vid_out = dev->loop_vid_copy; v4l2_rect_scale(&dev->loop_vid_out, &out_dev->compose_out, &out_dev->crop_out); dev->loop_vid_out.left += out_dev->crop_out.left; dev->loop_vid_out.top += out_dev->crop_out.top; dev->loop_vid_cap = dev->loop_vid_copy; v4l2_rect_scale(&dev->loop_vid_cap, &dev->crop_cap, &dev->compose_cap); dprintk(dev, 1, "loop_vid_copy: (%d,%d)/%ux%u loop_vid_out: (%d,%d)/%ux%u loop_vid_cap: (%d,%d)/%ux%u\n", dev->loop_vid_copy.left, dev->loop_vid_copy.top, dev->loop_vid_copy.width, dev->loop_vid_copy.height, dev->loop_vid_out.left, dev->loop_vid_out.top, dev->loop_vid_out.width, dev->loop_vid_out.height, dev->loop_vid_cap.left, dev->loop_vid_cap.top, dev->loop_vid_cap.width, dev->loop_vid_cap.height); v4l2_rect_intersect(&r_overlay, &r_fb, &r_overlay); /* shift r_overlay to the same origin as compose_out */ r_overlay.left += out_dev->compose_out.left - out_dev->overlay_out_left; r_overlay.top += out_dev->compose_out.top - out_dev->overlay_out_top; v4l2_rect_intersect(&dev->loop_vid_overlay, &r_overlay, &dev->loop_vid_copy); dev->loop_fb_copy = dev->loop_vid_overlay; /* shift dev->loop_fb_copy back again to the fb origin */ dev->loop_fb_copy.left -= out_dev->compose_out.left - out_dev->overlay_out_left; dev->loop_fb_copy.top -= out_dev->compose_out.top - out_dev->overlay_out_top; dev->loop_vid_overlay_cap = dev->loop_vid_overlay; v4l2_rect_scale(&dev->loop_vid_overlay_cap, &dev->crop_cap, &dev->compose_cap); dprintk(dev, 1, "loop_fb_copy: (%d,%d)/%ux%u loop_vid_overlay: (%d,%d)/%ux%u loop_vid_overlay_cap: (%d,%d)/%ux%u\n", dev->loop_fb_copy.left, dev->loop_fb_copy.top, dev->loop_fb_copy.width, dev->loop_fb_copy.height, dev->loop_vid_overlay.left, dev->loop_vid_overlay.top, dev->loop_vid_overlay.width, dev->loop_vid_overlay.height, dev->loop_vid_overlay_cap.left, dev->loop_vid_overlay_cap.top, dev->loop_vid_overlay_cap.width, dev->loop_vid_overlay_cap.height); } static void *plane_vaddr(struct tpg_data *tpg, struct vivid_buffer *buf, unsigned p, unsigned bpl[TPG_MAX_PLANES], unsigned h) { 
unsigned i; void *vbuf; if (p == 0 || tpg_g_buffers(tpg) > 1) return vb2_plane_vaddr(&buf->vb.vb2_buf, p); vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); for (i = 0; i < p; i++) vbuf += bpl[i] * h / tpg->vdownsampling[i]; return vbuf; } static noinline_for_stack int vivid_copy_buffer(struct vivid_dev *dev, struct vivid_dev *out_dev, unsigned p, u8 *vcapbuf, struct vivid_buffer *vid_cap_buf) { bool blank = dev->must_blank[vid_cap_buf->vb.vb2_buf.index]; struct tpg_data *tpg = &dev->tpg; struct vivid_buffer *vid_out_buf = NULL; unsigned vdiv = out_dev->fmt_out->vdownsampling[p]; unsigned twopixsize = tpg_g_twopixelsize(tpg, p); unsigned img_width = tpg_hdiv(tpg, p, dev->compose_cap.width); unsigned img_height = dev->compose_cap.height; unsigned stride_cap = tpg->bytesperline[p]; unsigned stride_out = out_dev->bytesperline_out[p]; unsigned stride_osd = dev->display_byte_stride; unsigned hmax = (img_height * tpg->perc_fill) / 100; u8 *voutbuf; u8 *vosdbuf = NULL; unsigned y; bool blend = out_dev->fbuf_out_flags; /* Coarse scaling with Bresenham */ unsigned vid_out_int_part; unsigned vid_out_fract_part; unsigned vid_out_y = 0; unsigned vid_out_error = 0; unsigned vid_overlay_int_part = 0; unsigned vid_overlay_fract_part = 0; unsigned vid_overlay_y = 0; unsigned vid_overlay_error = 0; unsigned vid_cap_left = tpg_hdiv(tpg, p, dev->loop_vid_cap.left); unsigned vid_cap_right; bool quick; vid_out_int_part = dev->loop_vid_out.height / dev->loop_vid_cap.height; vid_out_fract_part = dev->loop_vid_out.height % dev->loop_vid_cap.height; if (!list_empty(&out_dev->vid_out_active)) vid_out_buf = list_entry(out_dev->vid_out_active.next, struct vivid_buffer, list); if (vid_out_buf == NULL) return -ENODATA; vid_cap_buf->vb.field = vid_out_buf->vb.field; voutbuf = plane_vaddr(tpg, vid_out_buf, p, out_dev->bytesperline_out, out_dev->fmt_out_rect.height); if (p < out_dev->fmt_out->buffers) voutbuf += vid_out_buf->vb.vb2_buf.planes[p].data_offset; voutbuf += tpg_hdiv(tpg, p, dev->loop_vid_out.left) + (dev->loop_vid_out.top / vdiv) * stride_out; vcapbuf += tpg_hdiv(tpg, p, dev->compose_cap.left) + (dev->compose_cap.top / vdiv) * stride_cap; if (dev->loop_vid_copy.width == 0 || dev->loop_vid_copy.height == 0) { /* * If there is nothing to copy, then just fill the capture window * with black. */ for (y = 0; y < hmax / vdiv; y++, vcapbuf += stride_cap) memcpy(vcapbuf, tpg->black_line[p], img_width); return 0; } if (out_dev->overlay_out_enabled && dev->loop_vid_overlay.width && dev->loop_vid_overlay.height) { vosdbuf = dev->video_vbase; vosdbuf += (dev->loop_fb_copy.left * twopixsize) / 2 + dev->loop_fb_copy.top * stride_osd; vid_overlay_int_part = dev->loop_vid_overlay.height / dev->loop_vid_overlay_cap.height; vid_overlay_fract_part = dev->loop_vid_overlay.height % dev->loop_vid_overlay_cap.height; } vid_cap_right = tpg_hdiv(tpg, p, dev->loop_vid_cap.left + dev->loop_vid_cap.width); /* quick is true if no video scaling is needed */ quick = dev->loop_vid_out.width == dev->loop_vid_cap.width; dev->cur_scaled_line = dev->loop_vid_out.height; for (y = 0; y < hmax; y += vdiv, vcapbuf += stride_cap) { /* osdline is true if this line requires overlay blending */ bool osdline = vosdbuf && y >= dev->loop_vid_overlay_cap.top && y < dev->loop_vid_overlay_cap.top + dev->loop_vid_overlay_cap.height; /* * If this line of the capture buffer doesn't get any video, then * just fill with black. 
*/ if (y < dev->loop_vid_cap.top || y >= dev->loop_vid_cap.top + dev->loop_vid_cap.height) { memcpy(vcapbuf, tpg->black_line[p], img_width); continue; } /* fill the left border with black */ if (dev->loop_vid_cap.left) memcpy(vcapbuf, tpg->black_line[p], vid_cap_left); /* fill the right border with black */ if (vid_cap_right < img_width) memcpy(vcapbuf + vid_cap_right, tpg->black_line[p], img_width - vid_cap_right); if (quick && !osdline) { memcpy(vcapbuf + vid_cap_left, voutbuf + vid_out_y * stride_out, tpg_hdiv(tpg, p, dev->loop_vid_cap.width)); goto update_vid_out_y; } if (dev->cur_scaled_line == vid_out_y) { memcpy(vcapbuf + vid_cap_left, dev->scaled_line, tpg_hdiv(tpg, p, dev->loop_vid_cap.width)); goto update_vid_out_y; } if (!osdline) { scale_line(voutbuf + vid_out_y * stride_out, dev->scaled_line, tpg_hdiv(tpg, p, dev->loop_vid_out.width), tpg_hdiv(tpg, p, dev->loop_vid_cap.width), tpg_g_twopixelsize(tpg, p)); } else { /* * Offset in bytes within loop_vid_copy to the start of the * loop_vid_overlay rectangle. */ unsigned offset = ((dev->loop_vid_overlay.left - dev->loop_vid_copy.left) * twopixsize) / 2; u8 *osd = vosdbuf + vid_overlay_y * stride_osd; scale_line(voutbuf + vid_out_y * stride_out, dev->blended_line, dev->loop_vid_out.width, dev->loop_vid_copy.width, tpg_g_twopixelsize(tpg, p)); if (blend) blend_line(dev, vid_overlay_y + dev->loop_vid_overlay.top, dev->loop_vid_overlay.left, dev->blended_line + offset, osd, dev->loop_vid_overlay.width, twopixsize / 2); else memcpy(dev->blended_line + offset, osd, (dev->loop_vid_overlay.width * twopixsize) / 2); scale_line(dev->blended_line, dev->scaled_line, dev->loop_vid_copy.width, dev->loop_vid_cap.width, tpg_g_twopixelsize(tpg, p)); } dev->cur_scaled_line = vid_out_y; memcpy(vcapbuf + vid_cap_left, dev->scaled_line, tpg_hdiv(tpg, p, dev->loop_vid_cap.width)); update_vid_out_y: if (osdline) { vid_overlay_y += vid_overlay_int_part; vid_overlay_error += vid_overlay_fract_part; if (vid_overlay_error >= dev->loop_vid_overlay_cap.height) { vid_overlay_error -= dev->loop_vid_overlay_cap.height; vid_overlay_y++; } } vid_out_y += vid_out_int_part; vid_out_error += vid_out_fract_part; if (vid_out_error >= dev->loop_vid_cap.height / vdiv) { vid_out_error -= dev->loop_vid_cap.height / vdiv; vid_out_y++; } } if (!blank) return 0; for (; y < img_height; y += vdiv, vcapbuf += stride_cap) memcpy(vcapbuf, tpg->contrast_line[p], img_width); return 0; } static void vivid_fillbuff(struct vivid_dev *dev, struct vivid_buffer *buf) { struct vivid_dev *out_dev = NULL; struct tpg_data *tpg = &dev->tpg; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_cap) ? 2 : 1; unsigned line_height = 16 / factor; bool is_tv = vivid_is_sdtv_cap(dev); bool is_60hz = is_tv && (dev->std_cap[dev->input] & V4L2_STD_525_60); unsigned p; int line = 1; u8 *basep[TPG_MAX_PLANES][2]; unsigned ms; char str[100]; s32 gain; buf->vb.sequence = dev->vid_cap_seq_count; v4l2_ctrl_s_ctrl(dev->ro_int32, buf->vb.sequence & 0xff); if (dev->field_cap == V4L2_FIELD_ALTERNATE) { /* * 60 Hz standards start with the bottom field, 50 Hz standards * with the top field. So if the 0-based seq_count is even, * then the field is TOP for 50 Hz and BOTTOM for 60 Hz * standards. */ buf->vb.field = ((dev->vid_cap_seq_count & 1) ^ is_60hz) ? V4L2_FIELD_BOTTOM : V4L2_FIELD_TOP; /* * The sequence counter counts frames, not fields. So divide * by two. 
*/ buf->vb.sequence /= 2; } else { buf->vb.field = dev->field_cap; } tpg_s_field(tpg, buf->vb.field, dev->field_cap == V4L2_FIELD_ALTERNATE); tpg_s_perc_fill_blank(tpg, dev->must_blank[buf->vb.vb2_buf.index]); if (vivid_vid_can_loop(dev) && ((vivid_is_svid_cap(dev) && !VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) || (vivid_is_hdmi_cap(dev) && !VIVID_INVALID_SIGNAL(dev->dv_timings_signal_mode[dev->input])))) { out_dev = vivid_input_is_connected_to(dev); /* * If the vivid instance of the output device is different * from the vivid instance of this input device, then we * must take care to properly serialize the output device to * ensure that the buffer we are copying from is not freed * out from under us. * * If the output device is part of the same instance, then the * lock is already taken and there is no need to take the mutex. * * The problem with taking the mutex is that you can get * deadlocked if instance A locks instance B and vice versa. * It is not really worth trying to be very smart about this, * so just try to take the lock, and if you can't, then just * set out_dev to NULL and you will end up with a single frame * of Noise (the default test pattern in this case). */ if (out_dev && dev != out_dev && !mutex_trylock(&out_dev->mutex)) out_dev = NULL; } if (out_dev) vivid_precalc_copy_rects(dev, out_dev); for (p = 0; p < tpg_g_planes(tpg); p++) { void *vbuf = plane_vaddr(tpg, buf, p, tpg->bytesperline, tpg->buf_height); /* * The first plane of a multiplanar format has a non-zero * data_offset. This helps testing whether the application * correctly supports non-zero data offsets. */ if (p < tpg_g_buffers(tpg) && dev->fmt_cap->data_offset[p]) { memset(vbuf, dev->fmt_cap->data_offset[p] & 0xff, dev->fmt_cap->data_offset[p]); vbuf += dev->fmt_cap->data_offset[p]; } tpg_calc_text_basep(tpg, basep, p, vbuf); if (!out_dev || vivid_copy_buffer(dev, out_dev, p, vbuf, buf)) tpg_fill_plane_buffer(tpg, vivid_get_std_cap(dev), p, vbuf); } if (out_dev && dev != out_dev) mutex_unlock(&out_dev->mutex); dev->must_blank[buf->vb.vb2_buf.index] = false; /* Update the stream time, but only at the start of a new frame. */ if (dev->field_cap != V4L2_FIELD_ALTERNATE || (dev->vid_cap_seq_count & 1) == 0) dev->ms_vid_cap = jiffies_to_msecs(jiffies - dev->jiffies_vid_cap); ms = dev->ms_vid_cap; if (dev->osd_mode <= 1) { snprintf(str, sizeof(str), " %02d:%02d:%02d:%03d %u%s", (ms / (60 * 60 * 1000)) % 24, (ms / (60 * 1000)) % 60, (ms / 1000) % 60, ms % 1000, buf->vb.sequence, (dev->field_cap == V4L2_FIELD_ALTERNATE) ? (buf->vb.field == V4L2_FIELD_TOP ?
" top" : " bottom") : ""); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); } if (dev->osd_mode == 0) { snprintf(str, sizeof(str), " %dx%d, input %d ", dev->src_rect.width, dev->src_rect.height, dev->input); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); gain = v4l2_ctrl_g_ctrl(dev->gain); mutex_lock(dev->ctrl_hdl_user_vid.lock); snprintf(str, sizeof(str), " brightness %3d, contrast %3d, saturation %3d, hue %d ", dev->brightness->cur.val, dev->contrast->cur.val, dev->saturation->cur.val, dev->hue->cur.val); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), " autogain %d, gain %3d, alpha 0x%02x ", dev->autogain->cur.val, gain, dev->alpha->cur.val); mutex_unlock(dev->ctrl_hdl_user_vid.lock); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); mutex_lock(dev->ctrl_hdl_user_aud.lock); snprintf(str, sizeof(str), " volume %3d, mute %d ", dev->volume->cur.val, dev->mute->cur.val); mutex_unlock(dev->ctrl_hdl_user_aud.lock); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); mutex_lock(dev->ctrl_hdl_user_gen.lock); snprintf(str, sizeof(str), " int32 %d, ro_int32 %d, int64 %lld, bitmask %08x ", dev->int32->cur.val, dev->ro_int32->cur.val, *dev->int64->p_cur.p_s64, dev->bitmask->cur.val); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), " boolean %d, menu %s, string \"%s\" ", dev->boolean->cur.val, dev->menu->qmenu[dev->menu->cur.val], dev->string->p_cur.p_char); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), " integer_menu %lld, value %d ", dev->int_menu->qmenu_int[dev->int_menu->cur.val], dev->int_menu->cur.val); mutex_unlock(dev->ctrl_hdl_user_gen.lock); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); if (dev->button_pressed) { dev->button_pressed--; snprintf(str, sizeof(str), " button pressed!"); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); } if (dev->osd[0]) { if (vivid_is_hdmi_cap(dev)) { snprintf(str, sizeof(str), " OSD \"%s\"", dev->osd); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); } if (dev->osd_jiffies && time_is_before_jiffies(dev->osd_jiffies + 5 * HZ)) { dev->osd[0] = 0; dev->osd_jiffies = 0; } } } } static void vivid_cap_update_frame_period(struct vivid_dev *dev) { u64 f_period; f_period = (u64)dev->timeperframe_vid_cap.numerator * 1000000000; if (WARN_ON(dev->timeperframe_vid_cap.denominator == 0)) dev->timeperframe_vid_cap.denominator = 1; do_div(f_period, dev->timeperframe_vid_cap.denominator); if (dev->field_cap == V4L2_FIELD_ALTERNATE) f_period >>= 1; /* * If "End of Frame", then offset the exposure time by 0.9 * of the frame period. */ dev->cap_frame_eof_offset = f_period * 9; do_div(dev->cap_frame_eof_offset, 10); dev->cap_frame_period = f_period; } static noinline_for_stack void vivid_thread_vid_cap_tick(struct vivid_dev *dev, int dropped_bufs) { struct vivid_buffer *vid_cap_buf = NULL; struct vivid_buffer *vbi_cap_buf = NULL; struct vivid_buffer *meta_cap_buf = NULL; u64 f_time = 0; dprintk(dev, 1, "Video Capture Thread Tick\n"); while (dropped_bufs-- > 1) tpg_update_mv_count(&dev->tpg, dev->field_cap == V4L2_FIELD_NONE || dev->field_cap == V4L2_FIELD_ALTERNATE); /* Drop a certain percentage of buffers. 
*/ if (dev->perc_dropped_buffers && get_random_u32_below(100) < dev->perc_dropped_buffers) goto update_mv; spin_lock(&dev->slock); if (!list_empty(&dev->vid_cap_active)) { vid_cap_buf = list_entry(dev->vid_cap_active.next, struct vivid_buffer, list); list_del(&vid_cap_buf->list); } if (!list_empty(&dev->vbi_cap_active)) { if (dev->field_cap != V4L2_FIELD_ALTERNATE || (dev->vbi_cap_seq_count & 1)) { vbi_cap_buf = list_entry(dev->vbi_cap_active.next, struct vivid_buffer, list); list_del(&vbi_cap_buf->list); } } if (!list_empty(&dev->meta_cap_active)) { meta_cap_buf = list_entry(dev->meta_cap_active.next, struct vivid_buffer, list); list_del(&meta_cap_buf->list); } spin_unlock(&dev->slock); if (!vid_cap_buf && !vbi_cap_buf && !meta_cap_buf) goto update_mv; f_time = ktime_get_ns() + dev->time_wrap_offset; if (vid_cap_buf) { v4l2_ctrl_request_setup(vid_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_cap); /* Fill buffer */ vivid_fillbuff(dev, vid_cap_buf); dprintk(dev, 1, "filled buffer %d\n", vid_cap_buf->vb.vb2_buf.index); v4l2_ctrl_request_complete(vid_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_cap); vb2_buffer_done(&vid_cap_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vid_cap buffer %d done\n", vid_cap_buf->vb.vb2_buf.index); vid_cap_buf->vb.vb2_buf.timestamp = f_time; if (!dev->tstamp_src_is_soe) vid_cap_buf->vb.vb2_buf.timestamp += dev->cap_frame_eof_offset; } if (vbi_cap_buf) { u64 vbi_period; v4l2_ctrl_request_setup(vbi_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_cap); if (vbi_cap_buf->vb.vb2_buf.type == V4L2_BUF_TYPE_SLICED_VBI_CAPTURE) vivid_sliced_vbi_cap_process(dev, vbi_cap_buf); else vivid_raw_vbi_cap_process(dev, vbi_cap_buf); v4l2_ctrl_request_complete(vbi_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_cap); vb2_buffer_done(&vbi_cap_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vbi_cap %d done\n", vbi_cap_buf->vb.vb2_buf.index); /* If capturing a VBI, offset by 0.05 */ vbi_period = dev->cap_frame_period * 5; do_div(vbi_period, 100); vbi_cap_buf->vb.vb2_buf.timestamp = f_time + dev->cap_frame_eof_offset + vbi_period; } if (meta_cap_buf) { v4l2_ctrl_request_setup(meta_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_cap); vivid_meta_cap_fillbuff(dev, meta_cap_buf, f_time); v4l2_ctrl_request_complete(meta_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_cap); vb2_buffer_done(&meta_cap_buf->vb.vb2_buf, dev->dqbuf_error ? 
VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "meta_cap %d done\n", meta_cap_buf->vb.vb2_buf.index); meta_cap_buf->vb.vb2_buf.timestamp = f_time + dev->cap_frame_eof_offset; } dev->dqbuf_error = false; update_mv: /* Update the test pattern movement counters */ tpg_update_mv_count(&dev->tpg, dev->field_cap == V4L2_FIELD_NONE || dev->field_cap == V4L2_FIELD_ALTERNATE); } static int vivid_thread_vid_cap(void *data) { struct vivid_dev *dev = data; u64 numerators_since_start; u64 buffers_since_start; u64 next_jiffies_since_start; unsigned long jiffies_since_start; unsigned long cur_jiffies; unsigned wait_jiffies; unsigned numerator; unsigned denominator; int dropped_bufs; dprintk(dev, 1, "Video Capture Thread Start\n"); set_freezable(); /* Resets frame counters */ dev->cap_seq_offset = 0; dev->cap_seq_count = 0; dev->cap_seq_resync = false; dev->jiffies_vid_cap = jiffies; dev->cap_stream_start = ktime_get_ns(); if (dev->time_wrap) dev->time_wrap_offset = dev->time_wrap - dev->cap_stream_start; else dev->time_wrap_offset = 0; vivid_cap_update_frame_period(dev); for (;;) { try_to_freeze(); if (kthread_should_stop()) break; if (!mutex_trylock(&dev->mutex)) { schedule(); continue; } cur_jiffies = jiffies; if (dev->cap_seq_resync) { dev->jiffies_vid_cap = cur_jiffies; dev->cap_seq_offset = dev->cap_seq_count + 1; dev->cap_seq_count = 0; dev->cap_stream_start += dev->cap_frame_period * dev->cap_seq_offset; vivid_cap_update_frame_period(dev); dev->cap_seq_resync = false; } numerator = dev->timeperframe_vid_cap.numerator; denominator = dev->timeperframe_vid_cap.denominator; if (dev->field_cap == V4L2_FIELD_ALTERNATE) denominator *= 2; /* Calculate the number of jiffies since we started streaming */ jiffies_since_start = cur_jiffies - dev->jiffies_vid_cap; /* Get the number of buffers streamed since the start */ buffers_since_start = (u64)jiffies_since_start * denominator + (HZ * numerator) / 2; do_div(buffers_since_start, HZ * numerator); /* * After more than 0xf0000000 (rounded down to a multiple of * 'jiffies-per-day' to ease jiffies_to_msecs calculation) * jiffies have passed since we started streaming reset the * counters and keep track of the sequence offset. */ if (jiffies_since_start > JIFFIES_RESYNC) { dev->jiffies_vid_cap = cur_jiffies; dev->cap_seq_offset = buffers_since_start; buffers_since_start = 0; } dropped_bufs = buffers_since_start + dev->cap_seq_offset - dev->cap_seq_count; dev->cap_seq_count = buffers_since_start + dev->cap_seq_offset; dev->vid_cap_seq_count = dev->cap_seq_count - dev->vid_cap_seq_start; dev->vbi_cap_seq_count = dev->cap_seq_count - dev->vbi_cap_seq_start; dev->meta_cap_seq_count = dev->cap_seq_count - dev->meta_cap_seq_start; vivid_thread_vid_cap_tick(dev, dropped_bufs); /* * Calculate the number of 'numerators' streamed since we started, * including the current buffer. */ numerators_since_start = ++buffers_since_start * numerator; /* And the number of jiffies since we started */ jiffies_since_start = jiffies - dev->jiffies_vid_cap; mutex_unlock(&dev->mutex); /* * Calculate when that next buffer is supposed to start * in jiffies since we started streaming. 
*/ next_jiffies_since_start = numerators_since_start * HZ + denominator / 2; do_div(next_jiffies_since_start, denominator); /* If it is in the past, then just schedule asap */ if (next_jiffies_since_start < jiffies_since_start) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) continue; wait_queue_head_t wait; init_waitqueue_head(&wait); wait_event_interruptible_timeout(wait, kthread_should_stop(), cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "Video Capture Thread End\n"); return 0; } static void vivid_grab_controls(struct vivid_dev *dev, bool grab) { v4l2_ctrl_grab(dev->ctrl_has_crop_cap, grab); v4l2_ctrl_grab(dev->ctrl_has_compose_cap, grab); v4l2_ctrl_grab(dev->ctrl_has_scaler_cap, grab); } int vivid_start_generating_vid_cap(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_cap) { u32 seq_count = dev->cap_seq_count + dev->seq_wrap * 128; if (pstreaming == &dev->vid_cap_streaming) dev->vid_cap_seq_start = seq_count; else if (pstreaming == &dev->vbi_cap_streaming) dev->vbi_cap_seq_start = seq_count; else dev->meta_cap_seq_start = seq_count; *pstreaming = true; return 0; } /* Resets frame counters */ tpg_init_mv_count(&dev->tpg); dev->vid_cap_seq_start = dev->seq_wrap * 128; dev->vbi_cap_seq_start = dev->seq_wrap * 128; dev->meta_cap_seq_start = dev->seq_wrap * 128; dev->kthread_vid_cap = kthread_run(vivid_thread_vid_cap, dev, "%s-vid-cap", dev->v4l2_dev.name); if (IS_ERR(dev->kthread_vid_cap)) { int err = PTR_ERR(dev->kthread_vid_cap); dev->kthread_vid_cap = NULL; v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n"); return err; } *pstreaming = true; vivid_grab_controls(dev, true); dprintk(dev, 1, "returning from %s\n", __func__); return 0; } void vivid_stop_generating_vid_cap(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_cap == NULL) return; *pstreaming = false; if (pstreaming == &dev->vid_cap_streaming) { /* Release all active buffers */ while (!list_empty(&dev->vid_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vid_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vid_cap buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->vbi_cap_streaming) { while (!list_empty(&dev->vbi_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vbi_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vbi_cap buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->meta_cap_streaming) { while (!list_empty(&dev->meta_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->meta_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "meta_cap buffer %d done\n", buf->vb.vb2_buf.index); } } if (dev->vid_cap_streaming || dev->vbi_cap_streaming || dev->meta_cap_streaming) return; /* shutdown control thread */ vivid_grab_controls(dev, false); kthread_stop(dev->kthread_vid_cap); dev->kthread_vid_cap = NULL; }
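Both scale_line() and the frame pacing in vivid_thread_vid_cap() above use the same integer-only Bresenham stepping: advance by the integer quotient each iteration and accumulate the remainder into an error term that occasionally forces one extra step. A standalone sketch with hypothetical sizes shows which source index each destination position reads (scale_line() applies the same logic to two-pixel units):

#include <stdio.h>

int main(void)
{
	unsigned srcw = 7, dstw = 3;		/* hypothetical widths */
	unsigned int_part = srcw / dstw;	/* step per output position */
	unsigned fract_part = srcw % dstw;	/* remainder carried as error */
	unsigned src_x = 0, error = 0, x;

	for (x = 0; x < dstw; x++) {
		printf("dst %u <- src %u\n", x, src_x);
		src_x += int_part;
		error += fract_part;
		if (error >= dstw) {		/* error overflowed: extra step */
			error -= dstw;
			src_x++;
		}
	}
	return 0;				/* reads source indices 0, 2, 4 */
}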
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_IOPRIO_H #define _UAPI_LINUX_IOPRIO_H #include <linux/stddef.h> #include <linux/types.h> /* * Gives us 8 prio classes with 13 bits of data for each class */ #define IOPRIO_CLASS_SHIFT 13 #define IOPRIO_NR_CLASSES 8 #define IOPRIO_CLASS_MASK (IOPRIO_NR_CLASSES - 1) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(ioprio) \ (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) #define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) /* * These are the io priority classes as implemented by the BFQ and mq-deadline * schedulers. RT is the realtime class, it always gets premium service. For * ATA disks supporting NCQ IO priority, RT class IOs will be processed using * high priority NCQ commands. BE is the best-effort scheduling class, the * default for any process. IDLE is the idle scheduling class, it is only * served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE = 0, IOPRIO_CLASS_RT = 1, IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3, /* Special class to indicate an invalid ioprio value */ IOPRIO_CLASS_INVALID = 7, }; /* * The RT and BE priority classes both support up to 8 priority levels that * can be specified using the lower 3 bits of the priority data. */ #define IOPRIO_LEVEL_NR_BITS 3 #define IOPRIO_NR_LEVELS (1 << IOPRIO_LEVEL_NR_BITS) #define IOPRIO_LEVEL_MASK (IOPRIO_NR_LEVELS - 1) #define IOPRIO_PRIO_LEVEL(ioprio) ((ioprio) & IOPRIO_LEVEL_MASK) #define IOPRIO_BE_NR IOPRIO_NR_LEVELS /* * Possible values for the "which" argument of the ioprio_get() and * ioprio_set() system calls (see "man ioprio_set"). */ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; /* * Fallback BE class priority level. */ #define IOPRIO_NORM 4 #define IOPRIO_BE_NORM IOPRIO_NORM /* * The 10 bits between the priority class and the priority level are used to * optionally define I/O hints for any combination of I/O priority class and * level. Depending on the kernel configuration, I/O scheduler being used and * the target I/O device being used, hints can influence how I/Os are processed * without affecting the I/O scheduling ordering defined by the I/O priority * class and level. */ #define IOPRIO_HINT_SHIFT IOPRIO_LEVEL_NR_BITS #define IOPRIO_HINT_NR_BITS 10 #define IOPRIO_NR_HINTS (1 << IOPRIO_HINT_NR_BITS) #define IOPRIO_HINT_MASK (IOPRIO_NR_HINTS - 1) #define IOPRIO_PRIO_HINT(ioprio) \ (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_HINT_MASK) /* * I/O hints. */ enum { /* No hint */ IOPRIO_HINT_NONE = 0, /* * Device command duration limits: indicate to the device a desired * duration limit for the commands that will be used to process an I/O. * These will currently only be effective for SCSI and ATA devices that * support the command duration limits feature. If this feature is * enabled, then the commands issued to the device to process an I/O with * one of these hints set will have the duration limit index (dld field) * set to the value of the hint.
*/ IOPRIO_HINT_DEV_DURATION_LIMIT_1 = 1, IOPRIO_HINT_DEV_DURATION_LIMIT_2 = 2, IOPRIO_HINT_DEV_DURATION_LIMIT_3 = 3, IOPRIO_HINT_DEV_DURATION_LIMIT_4 = 4, IOPRIO_HINT_DEV_DURATION_LIMIT_5 = 5, IOPRIO_HINT_DEV_DURATION_LIMIT_6 = 6, IOPRIO_HINT_DEV_DURATION_LIMIT_7 = 7, }; #define IOPRIO_BAD_VALUE(val, max) ((val) < 0 || (val) >= (max)) /* * Return an I/O priority value based on a class, a level and a hint. */ static __always_inline __u16 ioprio_value(int prioclass, int priolevel, int priohint) { if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) || IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) || IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS)) return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; return (prioclass << IOPRIO_CLASS_SHIFT) | (priohint << IOPRIO_HINT_SHIFT) | priolevel; } #define IOPRIO_PRIO_VALUE(prioclass, priolevel) \ ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE) #define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \ ioprio_value(prioclass, priolevel, priohint) #endif /* _UAPI_LINUX_IOPRIO_H */
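Composing and decomposing a value with the macros above round-trips cleanly. A small userspace sketch, assuming the header is installed in the usual UAPI location as <linux/ioprio.h>:

#include <stdio.h>
#include <linux/ioprio.h>

int main(void)
{
	/* class RT, level 2, with duration-limit hint 1 */
	unsigned int ioprio = IOPRIO_PRIO_VALUE_HINT(IOPRIO_CLASS_RT, 2,
					IOPRIO_HINT_DEV_DURATION_LIMIT_1);

	/* expected output: class=1 level=2 hint=1 */
	printf("class=%d level=%d hint=%d\n",
	       IOPRIO_PRIO_CLASS(ioprio),
	       IOPRIO_PRIO_LEVEL(ioprio),
	       IOPRIO_PRIO_HINT(ioprio));
	return 0;
}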
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers */ #include <linux/module.h> #include <linux/mutex.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/tracepoint.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/static_key.h> enum tp_func_state { TP_FUNC_0, TP_FUNC_1, TP_FUNC_2,
TP_FUNC_N, }; extern tracepoint_ptr_t __start___tracepoints_ptrs[]; extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; enum tp_transition_sync { TP_TRANSITION_SYNC_1_0_1, TP_TRANSITION_SYNC_N_2_1, _NR_TP_TRANSITION_SYNC, }; struct tp_transition_snapshot { unsigned long rcu; bool ongoing; }; /* Protected by tracepoints_mutex */ static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC]; static void tp_rcu_get_state(enum tp_transition_sync sync) { struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; /* Keep the latest get_state snapshot. */ snapshot->rcu = get_state_synchronize_rcu(); snapshot->ongoing = true; } static void tp_rcu_cond_sync(enum tp_transition_sync sync) { struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; if (!snapshot->ongoing) return; cond_synchronize_rcu(snapshot->rcu); snapshot->ongoing = false; } /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; #ifdef CONFIG_MODULES /* * Tracepoint module list mutex protects the local module list. */ static DEFINE_MUTEX(tracepoint_module_list_mutex); /* Local list of struct tp_module */ static LIST_HEAD(tracepoint_module_list); #endif /* CONFIG_MODULES */ /* * tracepoints_mutex protects the builtin and module tracepoints. * tracepoints_mutex nests inside tracepoint_module_list_mutex. */ static DEFINE_MUTEX(tracepoints_mutex); /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent * state is reached. */ struct tp_probes { struct rcu_head rcu; struct tracepoint_func probes[]; }; /* Called in removal of a func but failed to allocate a new tp_funcs */ static void tp_stub_func(void) { return; } static inline void *allocate_probes(int count) { struct tp_probes *p = kmalloc(struct_size(p, probes, count), GFP_KERNEL); return p == NULL ? NULL : p->probes; } static void rcu_free_old_probes(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); if (tracepoint_is_faultable(tp)) call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes); else call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } static void debug_print_probes(struct tracepoint_func *funcs) { int i; if (!tracepoint_debug || !funcs) return; for (i = 0; funcs[i].func; i++) printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, int prio) { struct tracepoint_func *old, *new; int iter_probes; /* Iterate over old probe array. */ int nr_probes = 0; /* Counter for probes */ int pos = -1; /* Insertion position into new array */ if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); debug_print_probes(*funcs); old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ for (iter_probes = 0; old[iter_probes].func; iter_probes++) { if (old[iter_probes].func == tp_stub_func) continue; /* Skip stub functions. 
*/ if (old[iter_probes].func == tp_func->func && old[iter_probes].data == tp_func->data) return ERR_PTR(-EEXIST); nr_probes++; } } /* + 2 : one for new probe, one for NULL func */ new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) { nr_probes = 0; for (iter_probes = 0; old[iter_probes].func; iter_probes++) { if (old[iter_probes].func == tp_stub_func) continue; /* Insert before probes of lower priority */ if (pos < 0 && old[iter_probes].prio < prio) pos = nr_probes++; new[nr_probes++] = old[iter_probes]; } if (pos < 0) pos = nr_probes++; /* nr_probes now points to the end of the new array */ } else { pos = 0; nr_probes = 1; /* must point at end of array */ } new[pos] = *tp_func; new[nr_probes].func = NULL; *funcs = new; debug_print_probes(*funcs); return old; } static void *func_remove(struct tracepoint_func **funcs, struct tracepoint_func *tp_func) { int nr_probes = 0, nr_del = 0, i; struct tracepoint_func *old, *new; old = *funcs; if (!old) return ERR_PTR(-ENOENT); debug_print_probes(*funcs); /* (N -> M), (N > 1, M >= 0) probes */ if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { if ((old[nr_probes].func == tp_func->func && old[nr_probes].data == tp_func->data) || old[nr_probes].func == tp_stub_func) nr_del++; } } /* * If probe is NULL, then nr_probes = nr_del = 0, and then the * entire entry will be removed. */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ *funcs = NULL; debug_print_probes(*funcs); return old; } else { int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ new = allocate_probes(nr_probes - nr_del + 1); if (new) { for (i = 0; old[i].func; i++) { if ((old[i].func != tp_func->func || old[i].data != tp_func->data) && old[i].func != tp_stub_func) new[j++] = old[i]; } new[nr_probes - nr_del].func = NULL; *funcs = new; } else { /* * Failed to allocate, replace the old function * with calls to tp_stub_func. */ for (i = 0; old[i].func; i++) { if (old[i].func == tp_func->func && old[i].data == tp_func->data) WRITE_ONCE(old[i].func, tp_stub_func); } *funcs = old; } } debug_print_probes(*funcs); return old; } /* * Count the number of functions (enum tp_func_state) in a tp_funcs array. */ static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs) { if (!tp_funcs) return TP_FUNC_0; if (!tp_funcs[1].func) return TP_FUNC_1; if (!tp_funcs[2].func) return TP_FUNC_2; return TP_FUNC_N; /* 3 or more */ } static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs) { void *func = tp->iterator; /* Synthetic events do not have static call sites */ if (!tp->static_call_key) return; if (nr_func_state(tp_funcs) == TP_FUNC_1) func = tp_funcs[0].func; __static_call_update(tp->static_call_key, tp->static_call_tramp, func); } /* * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio, bool warn) { struct tracepoint_func *old, *tp_funcs; int ret; if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) { ret = tp->ext->regfunc(); if (ret < 0) return ret; } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } /* * rcu_assign_pointer has an smp_store_release() which makes sure * that the new probe callbacks array is consistent before setting * a pointer to it.
This array is referenced by __DO_TRACE from * include/linux/tracepoint.h using rcu_dereference_sched(). */ switch (nr_func_state(tp_funcs)) { case TP_FUNC_1: /* 0->1 */ /* * Make sure new static func never uses old data after a * 1->0->1 transition sequence. */ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, tp_funcs); static_branch_enable(&tp->key); break; case TP_FUNC_2: /* 1->2 */ /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* * Iterator callback installed before updating tp->funcs. * Requires ordering between RCU assign/dereference and * static call update/call. */ fallthrough; case TP_FUNC_N: /* N->N+1 (N>1) */ rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>1) transition sequence. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); break; } release_probes(tp, old); return 0; } /* * Remove a probe function from a tracepoint. * Note: only waiting an RCU period after setting elem->call to the empty * function ensures that the original callback is not used anymore. This is * ensured by preempt_disable around the call site. */ static int tracepoint_remove_func(struct tracepoint *tp, struct tracepoint_func *func) { struct tracepoint_func *old, *tp_funcs; tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (WARN_ON_ONCE(IS_ERR(old))) return PTR_ERR(old); if (tp_funcs == old) /* Failed allocating new tp_funcs, replaced func with stub */ return 0; switch (nr_func_state(tp_funcs)) { case TP_FUNC_0: /* 1->0 */ /* Removed last function */ if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key)) tp->ext->unregfunc(); static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, NULL); /* * Make sure new static func never uses old data after a * 1->0->1 transition sequence. */ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1); break; case TP_FUNC_1: /* 2->1 */ rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>2) transition sequence. If the first * element's data has changed, then force the synchronization * to prevent current readers that have loaded the old data * from calling the new function. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); break; case TP_FUNC_2: /* N->N-1 (N>2) */ fallthrough; case TP_FUNC_N: rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>2) transition sequence. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); break; } release_probes(tp, old); return 0; } /** * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * @prio: priority of this function over other registered functions * * Same as tracepoint_probe_register_prio() except that it will not warn * if the tracepoint is already registered.
*/ int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data, int prio) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; ret = tracepoint_add_func(tp, &tp_func, prio, false); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist); /** * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for * unregistering the probe before the module is gone. This can be * performed either with a tracepoint module going notifier, or from * within module exit functions. */ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data, int prio) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; ret = tracepoint_add_func(tp, &tp_func, prio, true); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); /** * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for * unregistering the probe before the module is gone. This can be * performed either with a tracepoint module going notifier, or from * within module exit functions. */ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) { return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); } EXPORT_SYMBOL_GPL(tracepoint_probe_register); /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer * @data: tracepoint data * * Returns 0 if ok, error value on error. */ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; ret = tracepoint_remove_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); static void for_each_tracepoint_range( tracepoint_ptr_t *begin, tracepoint_ptr_t *end, void (*fct)(struct tracepoint *tp, void *priv), void *priv) { tracepoint_ptr_t *iter; if (!begin) return; for (iter = begin; iter < end; iter++) fct(tracepoint_ptr_deref(iter), priv); } #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) | (1 << TAINT_LIVEPATCH)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** * register_tracepoint_module_notifier - register tracepoint coming/going notifier * @nb: notifier block * * Notifiers registered with this function are called on module * coming/going with the tracepoint_module_list_mutex held. * The notifier block callback should expect a "struct tp_module" data * pointer. 
*/ int register_tracepoint_module_notifier(struct notifier_block *nb) { struct tp_module *tp_mod; int ret; mutex_lock(&tracepoint_module_list_mutex); ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb); if (ret) goto end; list_for_each_entry(tp_mod, &tracepoint_module_list, list) (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod); end: mutex_unlock(&tracepoint_module_list_mutex); return ret; } EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier * @nb: notifier block * * The notifier block callback should expect a "struct tp_module" data * pointer. */ int unregister_tracepoint_module_notifier(struct notifier_block *nb) { struct tp_module *tp_mod; int ret; mutex_lock(&tracepoint_module_list_mutex); ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb); if (ret) goto end; list_for_each_entry(tp_mod, &tracepoint_module_list, list) (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod); end: mutex_unlock(&tracepoint_module_list_mutex); return ret; } EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); /* * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; if (!mod->num_tracepoints) return 0; /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. * Staging, out-of-tree, unsigned GPL, and test modules are fine. */ if (trace_module_has_bad_taint(mod)) return 0; tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); if (!tp_mod) return -ENOMEM; tp_mod->mod = mod; mutex_lock(&tracepoint_module_list_mutex); list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); mutex_unlock(&tracepoint_module_list_mutex); return 0; } static void tracepoint_module_going(struct module *mod) { struct tp_module *tp_mod; if (!mod->num_tracepoints) return; mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) { if (tp_mod->mod == mod) { blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_GOING, tp_mod); list_del(&tp_mod->list); kfree(tp_mod); /* * Called the going notifier before checking for * quiescence. */ for_each_tracepoint_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints, tp_module_going_check_quiescent, NULL); break; } } /* * In the case of modules that were tainted at "coming", we'll simply * walk through the list without finding it. We cannot use the "tainted" * flag on "going", in case a module taints the kernel only after being * loaded. 
*/ mutex_unlock(&tracepoint_module_list_mutex); } static int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; switch (val) { case MODULE_STATE_COMING: ret = tracepoint_module_coming(mod); break; case MODULE_STATE_LIVE: break; case MODULE_STATE_GOING: tracepoint_module_going(mod); break; case MODULE_STATE_UNFORMED: break; } return notifier_from_errno(ret); } static struct notifier_block tracepoint_module_nb = { .notifier_call = tracepoint_module_notify, .priority = 0, }; static __init int init_tracepoints(void) { int ret; ret = register_module_notifier(&tracepoint_module_nb); if (ret) pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } __initcall(init_tracepoints); /** * for_each_tracepoint_in_module - iteration on all tracepoints in a module * @mod: module * @fct: callback * @priv: private data */ void for_each_tracepoint_in_module(struct module *mod, void (*fct)(struct tracepoint *tp, struct module *mod, void *priv), void *priv) { tracepoint_ptr_t *begin, *end, *iter; lockdep_assert_held(&tracepoint_module_list_mutex); if (!mod) return; begin = mod->tracepoints_ptrs; end = mod->tracepoints_ptrs + mod->num_tracepoints; for (iter = begin; iter < end; iter++) fct(tracepoint_ptr_deref(iter), mod, priv); } /** * for_each_module_tracepoint - iteration on all tracepoints in all modules * @fct: callback * @priv: private data */ void for_each_module_tracepoint(void (*fct)(struct tracepoint *tp, struct module *mod, void *priv), void *priv) { struct tp_module *tp_mod; mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) for_each_tracepoint_in_module(tp_mod->mod, fct, priv); mutex_unlock(&tracepoint_module_list_mutex); } #endif /* CONFIG_MODULES */ /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback * @priv: private data */ void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), void *priv) { for_each_tracepoint_range(__start___tracepoints_ptrs, __stop___tracepoints_ptrs, fct, priv); } EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; int syscall_regfunc(void) { struct task_struct *p, *t; if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { set_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; return 0; } void syscall_unregfunc(void) { struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { clear_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } } #endif
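A minimal module-side sketch of the API above: locate a kernel tracepoint by name with for_each_kernel_tracepoint(), then attach and detach a probe with tracepoint_probe_register()/tracepoint_probe_unregister(). Every probe takes the registered data pointer as its first argument; the rest of probe_fork()'s signature is an assumption that must match sched_process_fork's TP_PROTO on the running kernel:

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/tracepoint.h>

static struct tracepoint *fork_tp;

/* assumed signature: void *data plus sched_process_fork's TP_PROTO */
static void probe_fork(void *data, struct task_struct *parent,
		       struct task_struct *child)
{
	pr_info("fork: %d -> %d\n", parent->pid, child->pid);
}

static void find_tp(struct tracepoint *tp, void *priv)
{
	if (!strcmp(tp->name, "sched_process_fork"))
		fork_tp = tp;
}

static int __init fork_probe_init(void)
{
	for_each_kernel_tracepoint(find_tp, NULL);
	if (!fork_tp)
		return -ENODEV;
	return tracepoint_probe_register(fork_tp, probe_fork, NULL);
}

static void __exit fork_probe_exit(void)
{
	tracepoint_probe_unregister(fork_tp, probe_fork, NULL);
}

module_init(fork_probe_init);
module_exit(fork_probe_exit);
MODULE_LICENSE("GPL");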
// SPDX-License-Identifier: GPL-2.0-only /* * llc_output.c - LLC minimal output path * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/llc.h> #include <net/llc_pdu.h> /** * llc_mac_hdr_init - fills MAC header fields * @skb: Address of the frame to initialize its MAC header * @sa: The MAC source address * @da: The MAC destination address * * Fills MAC header fields depending on the MAC type. Returns 0 if the * MAC type is valid and initialization completed correctly, and a * negative errno otherwise. */ int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da) { int rc = -EINVAL; switch (skb->dev->type) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa, skb->len); if (rc > 0) rc = 0; break; default: break; } return rc; } /** * llc_build_and_send_ui_pkt - unitdata request interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * Upper layers call this function when they want to send data using * connection-less mode communication (UI pdu). * * Accepts a data frame from the network layer to be sent using connection- * less mode communication; timeouts/retries are handled by the network * layer; packages the primitive as an event and sends it to the SAP event * handler. */ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap) { int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); if (likely(!rc)) rc = dev_queue_xmit(skb); else kfree_skb(skb); return rc; } EXPORT_SYMBOL(llc_mac_hdr_init); EXPORT_SYMBOL(llc_build_and_send_ui_pkt);
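/*
 * Illustrative sketch (not part of this file): one way a caller could push
 * a single connection-less UI PDU through a SAP obtained from
 * llc_sap_open(). Error handling is abbreviated and the example_* names
 * are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/llc.h>
#include <net/llc_pdu.h>

static int example_send_ui(struct llc_sap *sap, struct net_device *dev,
			   const unsigned char *dmac, unsigned char dsap,
			   const void *payload, size_t len)
{
	struct sk_buff *skb;

	/* Leave room for the device MAC header plus the U-format LLC header. */
	skb = alloc_skb(dev->hard_header_len + LLC_PDU_LEN_U + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;
	skb->dev = dev;
	skb_reserve(skb, dev->hard_header_len + LLC_PDU_LEN_U);
	skb_put_data(skb, payload, len);

	/* Consumes the skb on both the success and the failure path. */
	return llc_build_and_send_ui_pkt(sap, skb, dmac, dsap);
}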
// SPDX-License-Identifier: GPL-2.0-or-later /* * ChaCha and HChaCha functions (x86_64 optimized) * * Copyright (C) 2015 Martin Willi */ #include <asm/simd.h> #include <crypto/chacha.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sizes.h> asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) { len = min(len, maxblocks * CHACHA_BLOCK_SIZE); return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; } static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (static_branch_likely(&chacha_use_avx512vl)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; src += CHACHA_BLOCK_SIZE * 8; dst += CHACHA_BLOCK_SIZE * 8; state->x[12] += 8; } if (bytes > CHACHA_BLOCK_SIZE * 4) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 8); return; } if (bytes > CHACHA_BLOCK_SIZE * 2) { chacha_4block_xor_avx512vl(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 4); return; } if (bytes) { chacha_2block_xor_avx512vl(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 2); return; } } if (static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; src += 
CHACHA_BLOCK_SIZE * 8; dst += CHACHA_BLOCK_SIZE * 8; state->x[12] += 8; } if (bytes > CHACHA_BLOCK_SIZE * 4) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 8); return; } if (bytes > CHACHA_BLOCK_SIZE * 2) { chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 4); return; } if (bytes > CHACHA_BLOCK_SIZE) { chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 2); return; } } while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 4; src += CHACHA_BLOCK_SIZE * 4; dst += CHACHA_BLOCK_SIZE * 4; state->x[12] += 4; } if (bytes > CHACHA_BLOCK_SIZE) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); state->x[12] += chacha_advance(bytes, 4); return; } if (bytes) { chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); state->x[12]++; } } void hchacha_block_arch(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!static_branch_likely(&chacha_use_simd)) { hchacha_block_generic(state, out, nrounds); } else { kernel_fpu_begin(); hchacha_block_ssse3(state, out, nrounds); kernel_fpu_end(); } } EXPORT_SYMBOL(hchacha_block_arch); void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&chacha_use_simd) || bytes <= CHACHA_BLOCK_SIZE) return chacha_crypt_generic(state, dst, src, bytes, nrounds); do { unsigned int todo = min_t(unsigned int, bytes, SZ_4K); kernel_fpu_begin(); chacha_dosimd(state, dst, src, todo, nrounds); kernel_fpu_end(); bytes -= todo; src += todo; dst += todo; } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); bool chacha_is_arch_optimized(void) { return static_key_enabled(&chacha_use_simd); } EXPORT_SYMBOL(chacha_is_arch_optimized); static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) return 0; static_branch_enable(&chacha_use_simd); if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { static_branch_enable(&chacha_use_avx2); if (boot_cpu_has(X86_FEATURE_AVX512VL) && boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ static_branch_enable(&chacha_use_avx512vl); } return 0; } subsys_initcall(chacha_simd_mod_init); static void __exit chacha_simd_mod_exit(void) { } module_exit(chacha_simd_mod_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
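/*
 * Illustrative sketch (not part of this file): driving the entry points
 * above through the lib/crypto ChaCha API. Callers normally use
 * chacha_crypt(), which dispatches to chacha_crypt_arch() on this
 * architecture; this assumes chacha_init() from <crypto/chacha.h> to seed
 * the state with a 256-bit key and a 16-byte IV (counter || nonce).
 */
#include <crypto/chacha.h>
#include <linux/string.h>

static void example_chacha20_crypt(const u32 *key, const u8 iv[CHACHA_IV_SIZE],
				   u8 *buf, unsigned int len)
{
	struct chacha_state state;

	chacha_init(&state, key, iv);
	/* Stream cipher: encrypting in place with dst == src is fine. */
	chacha_crypt_arch(&state, buf, buf, len, 20);
	/* Wipe the expanded state, which contains key material. */
	memzero_explicit(&state, sizeof(state));
}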
// SPDX-License-Identifier: GPL-2.0 /* * media-dev-allocator.c - Media Controller Device Allocator API * * Copyright (c) 2019 Shuah Khan <shuah@kernel.org> * * Credits: Suggested by Laurent Pinchart <laurent.pinchart@ideasonboard.com> */ /* * This file adds a global refcounted Media Controller Device Instance API. * A system wide global media device list is managed and each media device * includes a kref count. The last put on the media device releases the media * device instance. * */ #include <linux/kref.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <media/media-device.h> #include <media/media-dev-allocator.h> static LIST_HEAD(media_device_list); static DEFINE_MUTEX(media_device_lock); struct media_device_instance { struct media_device mdev; struct module *owner; struct list_head list; struct kref refcount; }; static inline struct media_device_instance * to_media_device_instance(struct media_device *mdev) { return container_of(mdev, struct media_device_instance, mdev); } static void media_device_instance_release(struct kref *kref) { struct media_device_instance *mdi = container_of(kref, struct media_device_instance, refcount); dev_dbg(mdi->mdev.dev, "%s: releasing Media Device\n", __func__); mutex_lock(&media_device_lock); media_device_unregister(&mdi->mdev); media_device_cleanup(&mdi->mdev); list_del(&mdi->list); mutex_unlock(&media_device_lock); kfree(mdi); } /* Callers should hold media_device_lock when calling this function */ static struct media_device *__media_device_get(struct device *dev, const char *module_name, struct module *owner) { struct media_device_instance *mdi; list_for_each_entry(mdi, &media_device_list, list) { if (mdi->mdev.dev != dev) continue; kref_get(&mdi->refcount); /* get module reference for the media_device owner */ if (owner != mdi->owner && !try_module_get(mdi->owner)) dev_err(dev, "%s: module %s get owner reference error\n", __func__, module_name); else dev_dbg(dev, "%s: module %s got owner reference\n", __func__, module_name); return &mdi->mdev; } mdi = kzalloc(sizeof(*mdi), GFP_KERNEL); if (!mdi) return NULL; mdi->owner = owner; kref_init(&mdi->refcount); list_add_tail(&mdi->list, &media_device_list); dev_dbg(dev, "%s: Allocated media device for owner %s\n", __func__, module_name); return &mdi->mdev; } struct media_device *media_device_usb_allocate(struct usb_device *udev, const char *module_name, struct module *owner) { struct media_device *mdev; mutex_lock(&media_device_lock); mdev = __media_device_get(&udev->dev, module_name, owner); if (!mdev) { mutex_unlock(&media_device_lock); return ERR_PTR(-ENOMEM); } /* check if media device is already initialized */ if (!mdev->dev) __media_device_usb_init(mdev, udev, udev->product, module_name); mutex_unlock(&media_device_lock); return mdev; } EXPORT_SYMBOL_GPL(media_device_usb_allocate); void media_device_delete(struct media_device *mdev, const char *module_name, struct module *owner) { struct media_device_instance *mdi = to_media_device_instance(mdev); mutex_lock(&media_device_lock); /* put module reference for the 
media_device owner */ if (mdi->owner != owner) { module_put(mdi->owner); dev_dbg(mdi->mdev.dev, "%s: module %s put owner module reference\n", __func__, module_name); } mutex_unlock(&media_device_lock); kref_put(&mdi->refcount, media_device_instance_release); } EXPORT_SYMBOL_GPL(media_device_delete);
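/*
 * Illustrative sketch (not part of this file): how a USB driver pair
 * (e.g. the video and audio functions of one device) might share a single
 * media_device via the allocator above. The example_* names are
 * hypothetical and error handling is abbreviated.
 */
#include <linux/usb.h>
#include <media/media-device.h>
#include <media/media-dev-allocator.h>

static int example_probe(struct usb_interface *intf,
			 const struct usb_device_id *id)
{
	struct usb_device *udev = interface_to_usbdev(intf);
	struct media_device *mdev;
	int ret = 0;

	/* First caller allocates; later callers get another reference. */
	mdev = media_device_usb_allocate(udev, KBUILD_MODNAME, THIS_MODULE);
	if (IS_ERR(mdev))
		return PTR_ERR(mdev);

	/* Register only once, after the first user has populated it. */
	if (!media_devnode_is_registered(mdev->devnode))
		ret = media_device_register(mdev);
	if (ret) {
		media_device_delete(mdev, KBUILD_MODNAME, THIS_MODULE);
		return ret;
	}
	usb_set_intfdata(intf, mdev);
	return 0;
}

static void example_disconnect(struct usb_interface *intf)
{
	struct media_device *mdev = usb_get_intfdata(intf);

	/* Drops this driver's reference; the last put unregisters and frees. */
	media_device_delete(mdev, KBUILD_MODNAME, THIS_MODULE);
}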
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned 
int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. 
*/ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with these devices * * If @major == 0 this function will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a negative errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module provides only one type of device it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... 
*/ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 
0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and its corresponding * struct device, linking them together * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add() does. It then adds @dev to the system using device_add(). * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add(). * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @cdev: the cdev structure * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. 
* * NOTE: This guarantees that the cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev);
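/*
 * Illustrative sketch (not part of this file): the canonical pattern a
 * driver follows with the API above -- reserve a dev_t range, then make a
 * cdev live. The "example" names and the file_operations are hypothetical.
 */
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/module.h>

static dev_t example_devt;
static struct cdev example_cdev;

static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static int __init example_init(void)
{
	int err;

	/* Dynamically allocate a major with a single minor, starting at 0. */
	err = alloc_chrdev_region(&example_devt, 0, 1, "example");
	if (err)
		return err;

	cdev_init(&example_cdev, &example_fops);
	/* Once cdev_add() returns, open() can be called at any time. */
	err = cdev_add(&example_cdev, example_devt, 1);
	if (err)
		unregister_chrdev_region(example_devt, 1);
	return err;
}

static void __exit example_exit(void)
{
	cdev_del(&example_cdev);
	unregister_chrdev_region(example_devt, 1);
}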
// SPDX-License-Identifier: GPL-2.0 /* * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kobject.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> #include <linux/audit.h> #include <linux/random.h> #include <vdso/auxclock.h> #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) #define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ TK_ADV_TICK, /* Update timekeeper on a direct frequency change */ TK_ADV_FREQ }; /* * The most important data for readout fits into a single 64 byte * cache line. */ struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; struct timekeeper shadow_timekeeper; raw_spinlock_t lock; } ____cacheline_aligned; static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; /* The core timekeeper */ #define tk_core (timekeeper_data[TIMEKEEPER_CORE]) #ifdef CONFIG_POSIX_AUX_CLOCKS static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); } static inline bool tk_is_aux(const struct timekeeper *tk) { return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; } #else static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return false; } static inline bool tk_is_aux(const struct timekeeper *tk) { return false; } #endif /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit * is the index for the tk_read_base array * @base: tk_read_base array. Access is indexed by the lowest bit of * @seq. * * See @update_fast_timekeeper() below. */ struct tk_fast { seqcount_latch_t seq; struct tk_read_base base[2]; }; /* Suspend-time cycles value for halted fast timekeeper. */ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { if (timekeeping_suspended) return cycles_at_suspend; return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; /* * Boot time initialization which allows local_clock() to be utilized * during early boot when clocksources are not available. 
local_clock() * returns nanoseconds already so no conversion is required, hence mult=1 * and shift=0. When the first proper clocksource is installed, the fast * time keepers are updated with the correct values. */ #define FAST_TK_INIT \ { \ .clock = &dummy_clock, \ .mask = CLOCKSOURCE_MASK(64), \ .mult = 1, \ .shift = 0, \ } static struct tk_fast tk_fast_mono ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); static void tk_aux_update_clocksource(void); static void tk_aux_advance(void); #else static inline void tk_aux_setup(void) { } static inline void tk_aux_update_clocksource(void) { } static inline void tk_aux_advance(void) { } #endif unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; raw_spin_lock_irqsave(&tk_core.lock, flags); return flags; } void timekeeper_unlock_irqrestore(unsigned long flags) { raw_spin_unlock_irqrestore(&tk_core.lock, flags); } /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is * earlier than that value. * * mg_floor represents the latest fine-grained time that has been handed out as * a file timestamp on the system. This is tracked as a monotonic ktime_t, and * converted to a realtime clock value on an as-needed basis. * * Maintaining mg_floor ensures the multigrain interfaces never issue a * timestamp earlier than one that has been previously issued. * * The exception to this rule is when there is a backward realtime clock jump. If * such an event occurs, a timestamp can appear to be earlier than a previous one. */ static __cacheline_aligned_in_smp atomic64_t mg_floor; static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; tk->raw_sec++; } } static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = tk->coarse_nsec; return ts; } /* * Update the nanoseconds part for the coarse time keepers. They can't rely * on xtime_nsec because xtime_nsec could be adjusted by a small negative * amount when the multiplication factor of the clock is adjusted, which * could cause the coarse clocks to go slightly backwards. See * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse * clockids which is only updated when the clock has been set or we have * accumulated time. 
*/ static inline void tk_update_coarse_nsecs(struct timekeeper *tk) { tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. */ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ static inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } /** * tk_setup_internals - Set up internals to use clocksource clock. * * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment * pair and interval request. * * Unless you're the timekeeping code, you should not be using this! 
*/ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; tk->tkr_raw.xtime_nsec >>= -shift_change; } else { tk->tkr_mono.xtime_nsec <<= shift_change; tk->tkr_raw.xtime_nsec <<= shift_change; } } tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These values will be adjusted via NTP * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; /* * This detects both negative motion and the case where the delta * overflows the multiplication with tkr->mult. */ if (unlikely(delta > tkr->clock->max_cycles)) { /* * Handle clocksource inconsistency between CPUs to prevent * time from going backwards by checking for the MSB of the * mask being set in the delta. */ if (delta & ~(mask >> 1)) return tkr->xtime_nsec >> tkr->shift; return delta_to_ns_safe(tkr, delta); } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * * Employ the latch technique; see @write_seqcount_latch. * * So if an NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result in a * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. 
*/ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * * This timestamp is not guaranteed to be monotonic across an update. * The timestamp is calculated by: * * now = base_mono + clock_delta * slope * * So if the update lowers the slope, readers who are forced to the * not yet updated second array are still using the old steeper slope. * * tmono * ^ * | o n * | o n * | u * | o * |o * |12345678---> reader order * * o = old slope * u = update * n = new slope * * So reader 6 will observe time going backwards versus reader 5. * * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /** * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw * * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); /** * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset * protected with seqcounts. This has the following minor side effects: * * (1) It's possible that a timestamp is taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset * is added to the old timekeeping making the clock appear to update slightly * earlier: * CPU 0 CPU 1 * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /** * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. * * The same limitations as described for ktime_get_boot_fast_ns() apply. The * mono time and the TAI offset are not read atomically which may yield wrong * readouts.
However, an update of the TAI offset is a rare event, e.g. caused * by settime or adjtimex with an offset. The user of this function has to deal * with the possibility of wrong timestamps in post processing. */ u64 notrace ktime_get_tai_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); /** * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime. * * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return baser + delta; } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * * It generally is unsafe to access the clocksource after timekeeping has been * suspended, so take a snapshot of the readout base of @tk and use it as the * fast timekeeper's readout base while suspended. It will return the same * number of cycles every time until timekeeping is resumed, at which time the * proper readout base for the fast timekeeper will be restored automatically. */ static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; int ret; guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { guard(raw_spinlock_irqsave)(&tk_core.lock); return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* * tk_update_leap_state - helper to update the next_leap_ktime */ static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* * Leap state update for both shadow and the real timekeeper * Separate to spare a full memcpy() of the timekeeper.
*/ static void tk_update_leap_state_all(struct tk_data *tkd) { write_seqcount_begin(&tkd->seq); tk_update_leap_state(&tkd->shadow_timekeeper); tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; write_seqcount_end(&tkd->seq); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater than or equal to one second. Take * this into account before updating tk->ktime_sec. */ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* * Restore the shadow timekeeper from the real timekeeper. */ static void timekeeping_restore_shadow(struct tk_data *tkd) { lockdep_assert_held(&tkd->lock); memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); /* * Block out readers before running the updates below because that * updates VDSO and other time related infrastructure. Not blocking * the readers might let a reader see time going backwards when * reading from the VDSO after the VDSO update and then reading in * the kernel from the timekeeper before that got updated. */ write_seqcount_begin(&tkd->seq); if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } else if (tk_is_aux(tk)) { vdso_time_update_aux(tk); } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * Update the real timekeeper. * * We could avoid this memcpy() by switching pointers, but that has * the downside that the reader side no longer benefits from * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } /** * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; while (delta > 0) { u64 max = tk->tkr_mono.clock->max_cycles; u64 incr = delta < max ?
delta : max; tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; tk_normalize_xtime(tk); delta -= incr; } tk_update_coarse_nsecs(tk); } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended). */ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t base, *offset = offsets[offs]; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; if (IS_ENABLED(CONFIG_64BIT)) { /* * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and * tk_update_sleep_time(). 
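	 *
	 * On 64-bit, loading the ktime_t offset is a single atomic load,
	 * so the seqcount retry loop below is only needed on 32-bit,
	 * where the 64-bit offset could otherwise be observed half
	 * updated.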
*/ return ktime_add(tmono, READ_ONCE(*offset)); } do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non * serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wrap arounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds * * The same as ktime_get_real_seconds() but without the sequence counter * protection. This function is used in restricted contexts like the x86 MCE * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half * completed modification and only to be used for such critical contexts. 
* * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base_raw; ktime_t base_real; ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_boot = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? 
total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * timestamp_in_interval - true if ts is chronologically in [start, end] * * True if ts occurs chronologically at or after start, and before or at end. */ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { if (ts >= start && ts <= end) return true; if (start > end && (ts >= start || ts <= end)) return true; return false; } static bool convert_clock(u64 *val, u32 numerator, u32 denominator) { u64 rem, res; if (!numerator || !denominator) return false; res = div64_u64_rem(*val, denominator, &rem) * numerator; *val = res + div_u64(rem * numerator, denominator); return true; } static bool convert_base_to_cs(struct system_counterval_t *scv) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; u32 num, den; /* The timestamp was taken from the time keeper clock source */ if (cs->id == scv->cs_id) return true; /* * Check whether cs_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != scv->cs_id) return false; num = scv->use_nsecs ? cs->freq_khz : base->numerator; den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; if (!convert_clock(&scv->cycles, num, den)) return false; scv->cycles += base->offset; return true; } static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; /* * Check whether base_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently.
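	 *
	 * This is the inverse of convert_base_to_cs() above. A base clock
	 * reading maps to clocksource cycles as
	 *
	 *	cycles = base_cycles * numerator / denominator + offset
	 *
	 * so here the offset is subtracted first and the ratio is applied
	 * the other way around (denominator / numerator).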
*/ base = READ_ONCE(cs->base); if (!base || base->id != base_id) return false; *cycles -= base->offset; if (!convert_clock(cycles, base->denominator, base->numerator)) return false; return true; } static bool convert_ns_to_cs(u64 *delta) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) return false; *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); return true; } /** * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp * @treal: CLOCK_REALTIME timestamp to convert * @base_id: base clocksource id * @cycles: pointer to store the converted base clock timestamp * * Converts a supplied, future realtime clock value to the corresponding base clock value. * * Return: true if the conversion is successful, false otherwise. */ bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 delta; do { seq = read_seqcount_begin(&tk_core.seq); if ((u64)treal < tk->tkr_mono.base_real) return false; delta = (u64)treal - tk->tkr_mono.base_real; if (!convert_ns_to_cs(&delta)) return false; *cycles = tk->tkr_mono.cycle_last + delta; if (!convert_cs_to_base(cycles, base_id)) return false; } while (read_seqcount_retry(&tk_core.seq, seq)); return true; } EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned int seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. 
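		 * i.e. whether it lies within [cycle_last, now]. If it
		 * does not, the counter value predates the current tick
		 * interval and the historic interpolation path further
		 * down is taken instead.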
*/ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !timestamp_in_interval(history_begin->cycles, cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * timekeeping_clocksource_has_base - Check whether the current clocksource * is based on a given base clock * @id: base clocksource ID * * Note: The return value is a snapshot which can become invalid right * after the function returns. * * Return: true if the timekeeper clocksource has a base clock with @id, * false otherwise */ bool timekeeping_clocksource_has_base(enum clocksource_ids id) { /* * This is a snapshot, so no point in using the sequence * count. Just prevent the compiler from re-evaluating @base as the * clocksource might change concurrently. */ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); return base ? base->id == id : false; } EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time, updates NTP and notifies hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timespec64 ts_delta, xt; if (!timespec64_valid_settod(ts)) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); xt = tk_xtime(tks); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); tk_set_xtime(tks, ts); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); audit_tk_injoffset(ts_delta); add_device_randomness(ts, sizeof(*ts)); return 0; } EXPORT_SYMBOL(do_settimeofday64); static inline bool timekeeper_is_core_tk(struct timekeeper *tk) { return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; } /** * __timekeeping_inject_offset - Adds or subtracts from the current time.
* @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; timekeeping_forward_now(tks); if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); } else { struct tk_read_base *tkr_mono = &tks->tkr_mono; ktime_t now, offs; /* Get the current time */ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); /* Add the relative offset change */ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); /* Prevent that the resulting time becomes negative */ if (ktime_add(now, offs) < 0) { timekeeping_restore_shadow(tkd); return -EINVAL; } tks->offs_aux = offs; } timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } static int timekeeping_inject_offset(const struct timespec64 *ts) { int ret; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ if (!ret) clock_was_set(CLOCK_SET_WALL); return ret; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct clocksource *new = data, *old = NULL; /* * If the clocksource is in a module, get a module reference. * Succeeds for built-in code (owner == NULL) as well. Abort if the * reference can't be acquired. 
*/ if (!try_module_get(new->owner)) return 0; /* Abort if the device can't be enabled */ if (new->enable && new->enable(new) != 0) { module_put(new->owner); return 0; } scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); old = tks->tkr_mono.clock; tk_setup_internals(tks, new); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } tk_aux_update_clocksource(); if (old) { if (old->disable) old->disable(old); module_put(old->owner); } return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely unmodified by NTP) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * ktime_get_clock_ts64 - Returns time of a clock in a timespec * @id: POSIX clock ID of the clock to read * @ts: Pointer to the timespec64 to be set * * The timestamp is invalidated (@ts->tv_sec is set to -1) if the * clock @id is not available. */ void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) { /* Invalidate time stamp */ ts->tv_sec = -1; ts->tv_nsec = 0; switch (id) { case CLOCK_REALTIME: ktime_get_real_ts64(ts); return; case CLOCK_MONOTONIC: ktime_get_ts64(ts); return; case CLOCK_MONOTONIC_RAW: ktime_get_raw_ts64(ts); return; case CLOCK_AUX ... CLOCK_AUX_LAST: if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) ktime_get_aux_ts64(id, ts); return; default: WARN_ON_ONCE(1); } } EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock64 - Return time from the persistent clock. * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it.
*/ void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * @wall_time: current time as returned by persistent clock * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts off false and is only set when a suspend reaches * timekeeping_suspend(). timekeeping_resume() sets it to false when the * timekeeper clocksource is not stopping across suspend and has been * used to update sleep time. If the timekeeper clocksource has stopped, * then the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected; if so, the flag is cleared there. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection. */ static bool suspend_timing_needed; /* Flag indicating whether there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want to set wall_to_mono, so that the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tks, clock); tk_set_xtime(tks, &wall_time); tks->raw_sec = 0; tk_set_wall_to_mono(tks, wall_to_mono); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @tk: Pointer to the timekeeper to be updated * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables.
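 *
 * In effect: xtime advances by @delta, wall_to_monotonic is reduced by
 * @delta and offs_boot grows by @delta, so CLOCK_REALTIME and
 * CLOCK_BOOTTIME account for the time slept while CLOCK_MONOTONIC does
 * not.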
*/ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by the RTC subsystem. * If the system has neither 1) nor 2), 3) is used as the final fallback. * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes needless, so in this case we don't need to call * rtc_resume(), and this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /* * Whether 1) can be used is only determined in timekeeping_resume(), * which is invoked after rtc_suspend(), so rtc_suspend() cannot be * safely skipped if the system has 1). * * But if the system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(), and this is what * timekeeping_rtc_skipsuspend() means. */ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled, * and that also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; suspend_timing_needed = false; timekeeping_forward_now(tks); __timekeeping_inject_sleeptime(tks, delta); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; bool inject_sleeptime = false; u64 cycle_now, nsec; unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code.
cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ tks->tkr_mono.cycle_last = cycle_now; tks->tkr_raw.cycle_last = cycle_now; tks->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); /* Resume the clockevent device(s) and hrtimers */ tick_resume(); /* Notify timerfd as resume is equivalent to clock_was_set() */ timerfd_resume(); } int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock cannot be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); timekeeping_forward_now(tks); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. */ curr_clock = tks->tkr_mono.clock; cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * each of which can add ~1 second of drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update_from_shadow(&tk_core, 0); halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; static int __init timekeeping_init_ops(void) { register_syscore_ops(&timekeeping_syscore_ops); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, let's assume mult_adj == 1 for now.
* * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one, this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * It's the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment. * now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, it's possible the required corrective factor to * xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update.
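	 *
	 * The fixup below borrows one second back: it adds one second in
	 * shifted nanosecond units to xtime_nsec, decrements xtime_sec and
	 * sets skip_second_overflow so that accumulate_nsecs_to_secs()
	 * does not notify the NTP code again for the already accumulated
	 * second.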
*/ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e. xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if it's a leap sec and apply if needed */ leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /* * logarithmic_accumulation - shifted accumulation of cycles * * This function accumulates a shifted interval of cycles into * a shifted interval of nanoseconds, allowing for an O(log) * accumulation loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tkd->shadow_timekeeper; struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple.
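	 *
	 * Worked example (illustrative): with offset == 37 * cycle_interval
	 * (and cycle_interval a power of two), shift starts at 5, so the
	 * loop below accumulates 32 intervals in one go, then 4 at shift 2,
	 * then the last one at shift 0: three accumulation steps instead
	 * of 37.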
*/ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); /* * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls * making small negative adjustments to the base xtime_nsec * value, only update the coarse clocks if we accumulated time */ if (orig_offset != offset) tk_update_coarse_nsecs(tk); timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } static bool timekeeping_advance(enum timekeeping_adv_mode mode) { guard(raw_spinlock_irqsave)(&tk_core.lock); return __timekeeping_advance(&tk_core, mode); } /** * update_wall_time - Uses the current clocksource to increment the wall time * * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); tk_aux_advance(); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); /** * ktime_get_coarse_real_ts64_mg - return the later of coarse grained time or floor * @ts: timespec64 to be filled * * Fetch the global mg_floor value, convert it to realtime and compare it * to the current coarse-grained time. Fill @ts with whichever is * latest. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 floor = atomic64_read(&mg_floor); ktime_t f_real, offset, coarse; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); coarse = timespec64_to_ktime(*ts); f_real = ktime_add(floor, offset); if (ktime_after(f_real, coarse)) *ts = ktime_to_timespec64(f_real); } /** * ktime_get_real_ts64_mg - attempt to update floor value and return result * @ts: pointer to the timespec to be set * * Get a monotonic fine-grained time value and attempt to swap it into * mg_floor. If that succeeds then accept the new floor value. If it fails * then another task raced in during the interim time and updated the * floor. Since any update to the floor must be later than the previous * floor, either outcome is acceptable.
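 *
 * A sketch of the intended caller pattern (the ctime check is
 * hypothetical and named here only for illustration):
 *
 *	ktime_get_coarse_real_ts64_mg(&ts);
 *	if (!coarse_ts_changes_ctime(inode, &ts))	// hypothetical check
 *		ktime_get_real_ts64_mg(&ts);		// upgrade to fine-grained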
* * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), * and determining that the resulting coarse-grained timestamp did not effect * a change in ctime. Any more recent floor value would effect a change to * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. * * @ts will be filled with the latest floor value, regardless of the outcome of * the cmpxchg. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t old = atomic64_read(&mg_floor); ktime_t offset, mono; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; mono = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); mono = ktime_add_ns(mono, nsecs); /* * Attempt to update the floor with the new time value. As any * update must be later than the existing floor, and would effect * a change to ctime from the perspective of the current task, * accept the resulting floor value regardless of the outcome of * the swap. */ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. * "old" has been updated with the latest value of "mg_floor". * That value is newer than the previous floor value, which * is enough to effect a change to ctime. Accept it. */ *ts = ktime_to_timespec64(ktime_add(old, offset)); } } void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different.
* * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; /* * Check that a timespec/timeval used to inject a time * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields, but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and * we can't have more nanoseconds/microseconds than a second. */ if (txc->time.tv_usec < 0) return -EINVAL; if (txc->modes & ADJ_NANO) { if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; } else { if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } if (aux_clock) { /* Auxiliary clocks are similar to TAI and do not have leap seconds */ if (txc->status & (STA_INS | STA_DEL)) return -EINVAL; /* No TAI offset setting */ if (txc->modes & ADJ_TAI) return -EINVAL; /* No PPS support either */ if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) return -EINVAL; } return 0; } /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy().
*/ unsigned long random_get_entropy_fallback(void) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; struct clocksource *clock = READ_ONCE(tkr->clock); if (unlikely(timekeeping_suspended || !clock)) return 0; return clock->read(clock); } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); struct adjtimex_result { struct audit_ntp_data ad; struct timespec64 delta; bool clock_set; }; static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, struct adjtimex_result *result) { struct timekeeper *tks = &tkd->shadow_timekeeper; bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); if (!aux_clock) ktime_get_real_ts64(&ts); else tk_get_aux_ts64(tkd->timekeeper.id, &ts); add_device_randomness(&ts, sizeof(ts)); guard(raw_spinlock_irqsave)(&tkd->lock); if (!tks->clock_valid) return -ENODEV; if (txc->modes & ADJ_SETOFFSET) { result->delta.tv_sec = txc->time.tv_sec; result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) result->delta.tv_nsec *= 1000; ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; result->clock_set = true; } orig_tai = tai = tks->tai_offset; ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); result->clock_set = true; } else { tk_update_leap_state_all(&tk_core); } /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); return ret; } /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { struct adjtimex_result result = { }; int ret; ret = __do_adjtimex(&tk_core, txc, &result); if (ret < 0) return ret; if (txc->modes & ADJ_SETOFFSET) audit_tk_injoffset(result.delta); audit_ntp_log(&result.ad); if (result.clock_set) clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } /* * Invoked from NTP with the time keeper lock held, so lockless access is * fine. */ long ktime_get_ntp_seconds(unsigned int id) { return timekeeper_data[id].timekeeper.xtime_sec; } #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function * @phase_ts: Pointer to timespec64 structure representing phase timestamp * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS #include "posix-timers.h" /* * Bitmap for the activated auxiliary timekeepers to allow lockless quick * checks in the hot paths without touching extra cache lines. If set, then * the state of the corresponding timekeeper has to be re-checked under * timekeeper::lock. 
*/ static unsigned long aux_timekeepers; static inline unsigned int clockid_to_tkid(unsigned int id) { return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; } static inline struct tk_data *aux_get_tk_data(clockid_t id) { if (!clockid_aux_valid(id)) return NULL; return &timekeeper_data[clockid_to_tkid(id)]; } /* Invoked from timekeeping after a clocksource change */ static void tk_aux_update_clocksource(void) { unsigned long active = READ_ONCE(aux_timekeepers); unsigned int id; for_each_set_bit(id, &active, BITS_PER_LONG) { struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; struct timekeeper *tks = &tkd->shadow_timekeeper; guard(raw_spinlock_irqsave)(&tkd->lock); if (!tks->clock_valid) continue; timekeeping_forward_now(tks); tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); } } static void tk_aux_advance(void) { unsigned long active = READ_ONCE(aux_timekeepers); unsigned int id; /* Lockless quick check to avoid extra cache lines */ for_each_set_bit(id, &active, BITS_PER_LONG) { struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; guard(raw_spinlock)(&aux_tkd->lock); if (aux_tkd->shadow_timekeeper.clock_valid) __timekeeping_advance(aux_tkd, TK_ADV_TICK); } } /** * ktime_get_aux - Get time for an AUX clock * @id: ID of the clock to read (CLOCK_AUX...) * @kt: Pointer to ktime_t to store the time stamp * * Returns: True if the timestamp is valid, false otherwise */ bool ktime_get_aux(clockid_t id, ktime_t *kt) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tk; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); if (!aux_tkd) return false; aux_tk = &aux_tkd->timekeeper; do { seq = read_seqcount_begin(&aux_tkd->seq); if (!aux_tk->clock_valid) return false; base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); } while (read_seqcount_retry(&aux_tkd->seq, seq)); *kt = ktime_add_ns(base, nsecs); return true; } EXPORT_SYMBOL_GPL(ktime_get_aux); /** * ktime_get_aux_ts64 - Get time for an AUX clock * @id: ID of the clock to read (CLOCK_AUX...) * @ts: Pointer to timespec64 to store the time stamp * * Returns: True if the timestamp is valid, false otherwise */ bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) { ktime_t now; if (!ktime_get_aux(id, &now)) return false; *ts = ktime_to_timespec64(now); return true; } EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); static int aux_get_res(clockid_t id, struct timespec64 *tp) { if (!clockid_aux_valid(id)) return -ENODEV; tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; return 0; } static int aux_get_timespec(clockid_t id, struct timespec64 *tp) { return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; } static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tks; ktime_t tnow, nsecs; if (!timespec64_valid_settod(tnew)) return -EINVAL; if (!aux_tkd) return -ENODEV; aux_tks = &aux_tkd->shadow_timekeeper; guard(raw_spinlock_irq)(&aux_tkd->lock); if (!aux_tks->clock_valid) return -ENODEV; /* Forward the timekeeper base time */ timekeeping_forward_now(aux_tks); /* * Get the updated base time. tkr_mono.base has not been * updated yet, so do that first. That makes the update * in timekeeping_update_from_shadow() redundant, but * that's harmless. After that @tnow can be calculated * by using tkr_mono::cycle_last, which has been set * by timekeeping_forward_now(). */
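/*
 * Illustrative numbers (invented for this note): if the forwarded
 * monotonic base time @tnow works out to 100 seconds and userspace
 * requests @tnew == 1000 seconds, the offs_aux computed below becomes
 * +900 seconds; every later read of this auxiliary clock then returns
 * the monotonic time plus that offset.
 */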
tk_update_ktime_data(aux_tks); nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); /* * Calculate the new AUX offset as delta to @tnow ("monotonic"). * That avoids all the tk::xtime back and forth conversions as * xtime ("realtime") is not applicable for auxiliary clocks and * kept in sync with "monotonic". */ aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow); timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); return 0; } static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct adjtimex_result result = { }; if (!aux_tkd) return -ENODEV; /* * @result is ignored as there are neither hrtimers nor an * RTC related to auxiliary clocks for now. */ return __do_adjtimex(aux_tkd, txc, &result); } const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, .clock_set = aux_clock_set, .clock_adj = aux_clock_adj, }; static void aux_clock_enable(clockid_t id) { struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; /* Prevent the core timekeeper from changing. */ guard(raw_spinlock_irq)(&tk_core.lock); /* * Setup the auxiliary clock assuming that the raw core timekeeper * clock frequency conversion is close enough. Userspace has to * adjust for the deviation via clock_adjtime(2). */ guard(raw_spinlock_nested)(&aux_tkd->lock); /* Remove leftovers of a previous registration */ memset(aux_tks, 0, sizeof(*aux_tks)); /* Restore the timekeeper id */ aux_tks->id = aux_tkd->timekeeper.id; /* Setup the timekeeper based on the current system clocksource */ tk_setup_internals(aux_tks, tkr_raw->clock); /* Mark it valid and set it live */ aux_tks->clock_valid = true; timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); } static void aux_clock_disable(clockid_t id) { struct tk_data *aux_tkd = aux_get_tk_data(id); guard(raw_spinlock_irq)(&aux_tkd->lock); aux_tkd->shadow_timekeeper.clock_valid = false; timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); } static DEFINE_MUTEX(aux_clock_mutex); static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { /* Lazy atoi() as name is "0..7" */ int id = kobj->name[0] & 0x7; bool enable; if (!capable(CAP_SYS_TIME)) return -EPERM; if (kstrtobool(buf, &enable) < 0) return -EINVAL; guard(mutex)(&aux_clock_mutex); if (enable == test_bit(id, &aux_timekeepers)) return count; if (enable) { aux_clock_enable(CLOCK_AUX + id); set_bit(id, &aux_timekeepers); } else { aux_clock_disable(CLOCK_AUX + id); clear_bit(id, &aux_timekeepers); } return count; } static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned long active = READ_ONCE(aux_timekeepers); /* Lazy atoi() as name is "0..7" */ int id = kobj->name[0] & 0x7; return sysfs_emit(buf, "%d\n", test_bit(id, &active)); } static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); static struct attribute *aux_clock_enable_attrs[] = { &aux_clock_enable_attr.attr, NULL }; static const struct attribute_group aux_clock_enable_attr_group = { .attrs = aux_clock_enable_attrs, }; static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko =
kobject_create_and_add("time", kernel_kobj); if (!tko) return -ENOMEM; auxo = kobject_create_and_add("aux_clocks", tko); if (!auxo) { kobject_put(tko); return -ENOMEM; } for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); if (!clk) return -ENOMEM; int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) return ret; } return 0; } late_initcall(tk_aux_sysfs_init); static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) tkd_basic_setup(&timekeeper_data[i], i, false); } #endif /* CONFIG_POSIX_AUX_CLOCKS */
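/*
 * Editorial example (not part of the original source): a minimal
 * kernel-side sketch of how a consumer might read auxiliary clock 0
 * through the interfaces above. The function name is hypothetical.
 */
#ifdef CONFIG_POSIX_AUX_CLOCKS
static int __maybe_unused example_read_aux_clock0(void)
{
	struct timespec64 ts;

	/* Returns false while the clock has not been enabled via sysfs */
	if (!ktime_get_aux_ts64(CLOCK_AUX, &ts))
		return -ENODEV;

	pr_info("aux0: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
#endif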
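/*
 * Editorial example (not part of the original source): the
 * ADJ_SETOFFSET path validated in timekeeping_validate_timex() above is
 * reached from userspace via adjtimex(2)/clock_adjtime(2). A minimal
 * caller with CAP_SYS_TIME might look like this (userspace code):
 *
 *	struct timex tx = { .modes = ADJ_SETOFFSET | ADJ_NANO };
 *	tx.time.tv_sec = 0;
 *	tx.time.tv_usec = 500000000;	// read as nanoseconds with ADJ_NANO
 *	if (adjtimex(&tx) < 0)
 *		perror("adjtimex");
 *
 * tv_usec must be non-negative and below NSEC_PER_SEC here, matching
 * the validation above; a negative total offset is expressed with a
 * negative tv_sec and a non-negative tv_usec.
 */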
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/include/linux/jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com> * * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved * * Definitions for transaction data structures for the buffer cache * filesystem journaling support. */ #ifndef _LINUX_JBD2_H #define _LINUX_JBD2_H /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" #define JBD2_DEBUG #else #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> #include <linux/stddef.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <linux/crc32c.h> #endif #define journal_oom_retry 1 /* * Define JBD2_PARANOID_IOFAIL to cause a kernel BUG() if ext4 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext4 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on * known-good hardware we may want to trap these errors. */ #undef JBD2_PARANOID_IOFAIL /* * The default maximum commit age, in seconds. */ #define JBD2_DEFAULT_MAX_COMMIT_AGE 5 #ifdef CONFIG_JBD2_DEBUG /* * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal * consistency checks. By default we don't do this unless * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); #define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else #define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ /** * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. * * All filesystem modifications made by the process go * through this handle. Recursive operations (such as quota operations) * are gathered into a single update. * * The buffer credits field is used to account for journaled buffers * being modified by the running process. To ensure that there is * enough log space for all outstanding operations, we need to limit the * number of outstanding buffers possible at any time. When the * operation completes, any buffer credits not used are credited back to * the transaction, so that at all times we know how many buffers the * outstanding updates on a transaction might possibly touch.
* * This is an opaque datatype. **/ typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. * * journal_t is linked to from the fs superblock structure. * * We use the journal_t to keep track of all outstanding transaction * activity on the filesystem, and to manage the state of the log * writing process. * * This is an opaque datatype. **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif /* * Internal structures used by the logging mechanism: */ #define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ /* * On-disk structures */ /* * Descriptor block types: */ #define JBD2_DESCRIPTOR_BLOCK 1 #define JBD2_COMMIT_BLOCK 2 #define JBD2_SUPERBLOCK_V1 3 #define JBD2_SUPERBLOCK_V2 4 #define JBD2_REVOKE_BLOCK 5 /* * Standard header for all descriptor blocks: */ typedef struct journal_header_s { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; } journal_header_t; /* * Checksum types. */ #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 #define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: * * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* * fields are used to store a checksum of the descriptor and data blocks. * * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum * field is used to store crc32c(uuid+commit_block). Each journal metadata * block gets its own checksum, and data block checksums are stored in * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses * journal_block_tag3_t to store a full 32-bit checksum. Everything else * is the same as v2. * * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; unsigned char h_chksum_type; unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; __be64 h_commit_sec; __be32 h_commit_nsec; }; /* * The block tag: used to describe a single buffer in the journal. * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag3_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ __be32 t_checksum; /* crc32c(uuid+seq+block) */ } journal_block_tag3_t; typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. 
*/ } journal_block_tag_t; /* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log */ typedef struct jbd2_journal_revoke_header_s { journal_header_t r_header; __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ #define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ #define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ /* * The journal superblock. All fields are in big-endian byte order. */ typedef struct journal_superblock_s { /* 0x0000 */ journal_header_t s_header; /* 0x000C */ /* Static information describing the journal */ __be32 s_blocksize; /* journal device blocksize */ __be32 s_maxlen; /* total blocks in journal file */ __be32 s_first; /* first block of log information */ /* 0x0018 */ /* Dynamic information describing the current state of the log */ __be32 s_sequence; /* first commit ID expected in log */ __be32 s_start; /* blocknr of start of log */ /* 0x0020 */ /* Error value, as set by jbd2_journal_abort(). */ __be32 s_errno; /* 0x0024 */ /* Remaining fields are only valid in a version-2 superblock */ __be32 s_feature_compat; /* compatible feature set */ __be32 s_feature_incompat; /* incompatible feature set */ __be32 s_feature_ro_compat; /* readonly-compatible feature set */ /* 0x0030 */ __u8 s_uuid[16]; /* 128-bit uuid for journal */ /* 0x0040 */ __be32 s_nr_users; /* Nr of filesystems sharing log */ __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ /* 0x0048 */ __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ __be32 s_max_trans_data; /* Limit of data blocks per trans. 
*/ /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ __be32 s_head; /* blocknr of head of log, only uptodate * while the filesystem is clean */ /* 0x005C */ __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ /* 0x0400 */ } journal_superblock_t; #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 #define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ #include <linux/fs.h> #include <linux/sched.h> enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ BH_Freed, /* Has been freed (truncated) */ BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) BUFFER_FNS(JWrite, jwrite) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) BUFFER_FNS(Revoked, revoked) TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) BUFFER_FNS(Shadow, shadow) BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { return jh->b_bh; } static inline struct journal_head *bh2jh(struct buffer_head *bh) { return bh->b_private; } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { bit_spin_unlock(BH_JournalHead, &bh->b_state); } #define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) #define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) #else #define __journal_expect(expr, why...) \ ({ \ int val = (expr); \ if (!val) { \ printk(KERN_ERR \ "JBD2 unexpected failure: %s: %s;\n", \ __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ }) #define J_EXPECT(expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 #define __JI_WRITE_DATA 1 #define __JI_WAIT_DATA 2 /* * Commit of the inode data in progress. 
We use this flag to protect us from * concurrent deletion of inode. We cannot use a reference to the inode for this * since we cannot afford doing the last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) /* Write allocated dirty buffers in this inode before commit */ #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { /** * @i_transaction: * * Which transaction does this inode belong to? Either the running * transaction or the committing one. [j_list_lock] */ transaction_t *i_transaction; /** * @i_next_transaction: * * Pointer to the running transaction modifying inode's data in case * there is already a committing transaction touching it. [j_list_lock] */ transaction_t *i_next_transaction; /** * @i_list: List of inodes in the i_transaction [j_list_lock] */ struct list_head i_list; /** * @i_vfs_inode: * * VFS inode this inode belongs to [constant for lifetime of structure] */ struct inode *i_vfs_inode; /** * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; /** * @i_dirty_start: * * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ loff_t i_dirty_start; /** * @i_dirty_end: * * Inclusive offset in bytes where the dirty range for this inode * ends. [j_list_lock] */ loff_t i_dirty_end; }; struct jbd2_revoke_table_s; /** * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal the handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. * @h_total_credits: Number of remaining buffers we are allowed to add to * journal. These are dirty buffers and revoke descriptor blocks. * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle start time. * @h_requested_credits: Holds @h_total_credits after handle is started. * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ /* Docbook can't yet cope with the bit fields, but we will leave the documentation * in so it can be fixed later. */
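/*
 * Editorial example (not part of the original source): typical handle
 * usage by a filesystem, using the interfaces declared later in this
 * header. h_total_credits starts at the requested 8 here, and unused
 * credits flow back to the transaction in jbd2_journal_stop():
 *
 *	handle_t *handle = jbd2_journal_start(journal, 8);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	if (!err)
 *		err = jbd2_journal_dirty_metadata(handle, bh);
 *	jbd2_journal_stop(handle);
 */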
struct jbd2_journal_handle { union { transaction_t *h_transaction; /* Which journal the handle belongs to - used iff h_reserved set */ journal_t *h_journal; }; handle_t *h_rsv_handle; int h_total_credits; int h_revoke_credits; int h_revoke_credits_requested; int h_ref; int h_err; /* Flags [no locking] */ unsigned int h_sync: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; unsigned int h_line_no: 16; unsigned long h_start_jiffies; unsigned int h_requested_credits; unsigned int saved_alloc_context; }; /* * Some stats for checkpoint phase */ struct transaction_chp_stats_s { unsigned long cs_chp_time; __u32 cs_forced_to_close; __u32 cs_written; __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * * RUNNING: accepting new updates * LOCKED: Updates still running but we don't accept new ones * RUNDOWN: Updates are tidying up but have finished requesting * new buffers to modify (state not used for now) * FLUSH: All updates complete, but we are still writing to disk * COMMIT: All data on disk, writing commit record * FINISHED: We still have to keep the transaction for checkpointing. * * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. * (Locking Documentation improved by LockDoc) */ /* * Lock ranking: * * j_list_lock * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock * ->b_state_lock * * b_state_lock * ->j_list_lock * * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ struct transaction_s { /* Pointer to the journal for this transaction. [no locking] */ journal_t *t_journal; /* Sequence number for this transaction [no locking] */ tid_t t_tid; /* * Transaction's current state * [no locking - only kjournald2 alters this] * [j_list_lock] guards transition of a transaction into T_FINISHED * state and subsequent call of __jbd2_journal_drop_transaction() * FIXME: needs barriers * KLUDGE: [use j_state_lock] */ enum { T_RUNNING, T_LOCKED, T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_COMMIT_CALLBACK, T_FINISHED } t_state; /* * Where in the log does this transaction's commit start? [no locking] */ unsigned long t_log_start; /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet * modified by this transaction [j_list_lock, no locks needed for * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] */ struct journal_head *t_forget; /* * Doubly-linked circular list of all buffers still to be flushed before * this transaction can be checkpointed. [j_list_lock] */ struct journal_head *t_checkpoint_list; /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and * the shadow buffers on this list match each other one for * one at all times.
[j_list_lock, no locks needed for jbd2 * thread] */ struct journal_head *t_shadow_list; /* * List of inodes associated with the transaction; e.g., ext4 uses * this to track inodes in data=ordered and data=journal mode that * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; /* * Longest time some handle had to wait for running transaction */ unsigned long t_max_wait; /* * When transaction started */ unsigned long t_start; /* * When commit was requested [j_state_lock] */ unsigned long t_requested; /* * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; /* * Number of outstanding updates running on this transaction * [none] */ atomic_t t_updates; /* * Number of blocks reserved for this transaction in the journal. * This is including all credits reserved when starting transaction * handles as well as all journal descriptor blocks needed for this * transaction. [none] */ atomic_t t_outstanding_credits; /* * Number of revoke records for this transaction added by already * stopped handles. [none] */ atomic_t t_outstanding_revokes; /* * How many handles used this transaction? [none] */ atomic_t t_handle_count; /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] */ transaction_t *t_cpnext, *t_cpprev; /* * When will the transaction expire (become due for commit), in jiffies? * [no locking] */ unsigned long t_expires; /* * When this transaction started, in nanoseconds [no locking] */ ktime_t t_start_time; /* * This transaction is being forced and some process is * waiting for it to finish. */ unsigned int t_synchronous_commit:1; /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; }; struct transaction_run_stats_s { unsigned long rs_wait; unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; unsigned long rs_logging; __u32 rs_handle_count; __u32 rs_blocks; __u32 rs_blocks_logged; }; struct transaction_stats_s { unsigned long ts_tid; unsigned long ts_requested; struct transaction_run_stats_s run; }; static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { if (end >= start) return end - start; return end + (MAX_JIFFY_OFFSET - start); } #define JBD2_NR_BATCH 64 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; #define JBD2_FC_REPLAY_STOP 0 #define JBD2_FC_REPLAY_CONTINUE 1 /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. */ struct journal_s { /** * @j_flags: General journaling state flags [j_state_lock, * no lock for quick racy checks] */ unsigned long j_flags; /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior * abort)? [j_state_lock] */ int j_errno; /** * @j_abort_mutex: Lock the whole aborting procedure. */ struct mutex j_abort_mutex; /** * @j_sb_buffer: The first part of the superblock buffer. */ struct buffer_head *j_sb_buffer; /** * @j_superblock: The second part of the superblock buffer. */ journal_superblock_t *j_superblock; /** * @j_state_lock: Protect the various scalars in the journal. */ rwlock_t j_state_lock; /** * @j_barrier_count: * * Number of processes waiting to create a barrier lock [j_state_lock, * no lock for quick racy checks] */ int j_barrier_count; /** * @j_barrier: The barrier lock itself. */ struct mutex j_barrier; /** * @j_running_transaction: * * Transactions: The current running transaction... 
* [j_state_lock, no lock for quick racy checks] [caller holding * open handle] */ transaction_t *j_running_transaction; /** * @j_committing_transaction: * * The transaction we are pushing to disk * [j_state_lock] [caller holding open handle] */ transaction_t *j_committing_transaction; /** * @j_checkpoint_transactions: * * ... and a linked circular list of all transactions waiting for * checkpointing. [j_list_lock] */ transaction_t *j_checkpoint_transactions; /** * @j_wait_transaction_locked: * * Wait queue for waiting for a locked transaction to start committing, * or for a barrier lock to be released. */ wait_queue_head_t j_wait_transaction_locked; /** * @j_wait_done_commit: Wait queue for waiting for commit to complete. */ wait_queue_head_t j_wait_done_commit; /** * @j_wait_commit: Wait queue to trigger commit. */ wait_queue_head_t j_wait_commit; /** * @j_wait_updates: Wait queue to wait for updates to complete. */ wait_queue_head_t j_wait_updates; /** * @j_wait_reserved: * * Wait queue to wait for reserved buffer credits to drop. */ wait_queue_head_t j_wait_reserved; /** * @j_fc_wait: * * Wait queue to wait for completion of async fast commits. */ wait_queue_head_t j_fc_wait; /** * @j_checkpoint_mutex: * * Semaphore for locking against concurrent checkpoints. */ struct mutex j_checkpoint_mutex; /** * @j_chkpt_bhs: * * List of buffer heads used by the checkpoint routine. This * was moved from jbd2_log_do_checkpoint() to reduce stack * usage. Access to this array is controlled by the * @j_checkpoint_mutex. [j_checkpoint_mutex] */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; /** * @j_shrinker: * * Journal head shrinker, reclaims the journal heads of buffers * which have been written back. */ struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ struct percpu_counter j_checkpoint_jh_count; /** * @j_shrink_transaction: * * Records the next transaction that will be shrunk on the checkpoint * list. [j_list_lock] */ transaction_t *j_shrink_transaction; /** * @j_head: * * Journal head: identifies the first unused block in the journal. * [j_state_lock] */ unsigned long j_head; /** * @j_tail: * * Journal tail: identifies the oldest still-used block in the journal. * [j_state_lock] */ unsigned long j_tail; /** * @j_free: * * Journal free: how many free blocks are there in the journal? * [j_state_lock] */ unsigned long j_free; /** * @j_first: * * The block number of the first usable block in the journal * [j_state_lock]. */ unsigned long j_first; /** * @j_last: * * The block number one beyond the last usable block in the journal * [j_state_lock]. */ unsigned long j_last; /** * @j_fc_first: * * The block number of the first fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_first; /** * @j_fc_off: * * Number of fast commit blocks currently allocated. Accessed only * during fast commit. Currently only one process can do fast commit, so * this field is not protected by any lock. */ unsigned long j_fc_off; /** * @j_fc_last: * * The block number one beyond the last fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_last; /** * @j_dev: Device where we store the journal. */ struct block_device *j_dev; /** * @j_blocksize: Block size for the location where we store the journal. */ int j_blocksize; /** * @j_blk_offset: * * Starting block offset into the device where we store the journal. */ unsigned long long j_blk_offset; /** * @j_devname: Journal device name.
*/ char j_devname[BDEVNAME_SIZE+24]; /** * @j_fs_dev: * * Device which holds the client fs. For internal journal this will be * equal to j_dev. */ struct block_device *j_fs_dev; /** * @j_fs_dev_wb_err: * * Records the errseq of the client fs's backing block device. */ errseq_t j_fs_dev_wb_err; /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; /** * @j_reserved_credits: * * Number of buffers reserved from the running transaction. */ atomic_t j_reserved_credits; /** * @j_list_lock: Protects the buffer lists and internal buffer state. */ spinlock_t j_list_lock; /** * @j_inode: * * Optional inode where we store the journal. If present, all * journal block numbers are mapped into this inode via bmap(). */ struct inode *j_inode; /** * @j_tail_sequence: * * Sequence number of the oldest transaction in the log [j_state_lock] */ tid_t j_tail_sequence; /** * @j_transaction_sequence: * * Sequence number of the next transaction to grant [j_state_lock] */ tid_t j_transaction_sequence; /** * @j_commit_sequence: * * Sequence number of the most recently committed transaction * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; /** * @j_commit_request: * * Sequence number of the most recent transaction wanting commit * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; /** * @j_uuid: * * Journal uuid: identifies the object (filesystem, LVM volume etc) * backed by this journal. This will eventually be replaced by an array * of uuids, allowing us to index multiple devices within a single * journal and to perform atomic updates across them. */ __u8 j_uuid[16]; /** * @j_task: Pointer to the current commit thread for this journal. */ struct task_struct *j_task; /** * @j_max_transaction_buffers: * * Maximum number of metadata buffers to allow in a single compound * commit transaction. */ int j_max_transaction_buffers; /** * @j_revoke_records_per_block: * * Number of revoke records that fit in one descriptor block. */ int j_revoke_records_per_block; /** * @j_transaction_overhead_buffers: * * Number of blocks each transaction needs for its own bookkeeping */ int j_transaction_overhead_buffers; /** * @j_commit_interval: * * What is the maximum transaction lifetime before we begin a commit? */ unsigned long j_commit_interval; /** * @j_commit_timer: The timer used to wakeup the commit thread. */ struct timer_list j_commit_timer; /** * @j_revoke_lock: Protect the revoke table. */ spinlock_t j_revoke_lock; /** * @j_revoke: * * The revoke table - maintains the list of revoked blocks in the * current transaction. */ struct jbd2_revoke_table_s *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ struct jbd2_revoke_table_s *j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. */ struct buffer_head **j_wbuf; /** * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only * during a fast commit. Currently only one process can do fast commit, so * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; /** * @j_wbufsize: * * Size of @j_wbuf array. */ int j_wbufsize; /** * @j_fc_wbufsize: * * Size of @j_fc_wbuf array. */ int j_fc_wbufsize; /** * @j_last_sync_writer: * * The pid of the last person to run a synchronous operation * through the journal. */ pid_t j_last_sync_writer; /** * @j_average_commit_time: * * The average amount of time in nanoseconds it takes to commit a * transaction to disk.
[j_state_lock] */ u64 j_average_commit_time; /** * @j_min_batch_time: * * Minimum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_min_batch_time; /** * @j_max_batch_time: * * Maximum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_max_batch_time; /** * @j_commit_callback: * * This function is called when a transaction is closed. */ void (*j_commit_callback)(journal_t *, transaction_t *); /** * @j_submit_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WRITE_DATA flag * before we start to write out the transaction to the journal. */ int (*j_submit_inode_data_buffers) (struct jbd2_inode *); /** * @j_finish_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WAIT_DATA flag * after we have written the transaction to the journal * but before we write out the commit block. */ int (*j_finish_inode_data_buffers) (struct jbd2_inode *); /* * Journal statistics */ /** * @j_history_lock: Protect the transactions statistics history. */ spinlock_t j_history_lock; /** * @j_proc_entry: procfs entry for the jbd statistics directory. */ struct proc_dir_entry *j_proc_entry; /** * @j_stats: Overall statistics. */ struct transaction_stats_s j_stats; /** * @j_failed_commit: Failed journal commit ID. */ unsigned int j_failed_commit; /** * @j_private: * * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here. */ void *j_private; /** * @j_csum_seed: * * Precomputed journal UUID checksum for seeding other checksums. */ __u32 j_csum_seed; #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * @j_trans_commit_map: * * Lockdep entity to track transaction commit dependencies. Handles * hold this "lock" for read, when we wait for commit, we acquire the * "lock" for writing. This matches the properties of jbd2 journalling * where the running transaction has to wait for all handles to be * dropped to commit that transaction and also acquiring a handle may * require transaction commit to finish. */ struct lockdep_map j_trans_commit_map; #endif /** * @j_fc_cleanup_callback: * * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast * commit. JBD2 calls this function for each fast commit block found in * the journal. This function should return JBD2_FC_REPLAY_CONTINUE * to indicate that the block was processed correctly and more fast * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP * indicates the end of replay (no more blocks remaining). A negative * return value indicates error. */ int (*j_fc_replay_callback)(struct journal_s *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); /** * @j_bmap: * * Bmap function that should be used instead of the generic * VFS bmap function. */ int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* * We can support any known requested features iff the * superblock is not in version 1. 
Otherwise we fail to support any * extended sb features. */ static inline bool jbd2_format_support_feature(journal_t *j) { return j->j_superblock->s_header.h_blocktype != cpu_to_be32(JBD2_SUPERBLOCK_V1); } /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat |= \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat &= \ ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat |= \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat &= \ ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat |= \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat &= \ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) JBD2_FEATURE_INCOMPAT_FUNCS(revoke, REVOKE) JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* Journal high priority write IO operation flags */ #define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) /* * Journal flag definitions */ #define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ #define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. 
*/ #define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer * management */ /* Filing buffers */ extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); } static inline void jbd2_unfile_log_bh(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); } /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); /* * Triggers */ struct jbd2_buffer_trigger_type { /* * Fired at the moment data to write to the journal are known to be * stable - so either at the moment b_frozen_data is created or just * before a buffer is written to the journal. mapped_data is a mapped * buffer that is the frozen data for commit. */ void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); /* * Fired during journal abort for dirty buffers that will not be * committed.
*/ void (*t_abort)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh); }; extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers); /* Buffer IO */ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr); /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); extern void jbd2_journal_free_transaction(transaction_t *); /* * Journal locking. * * We need to lock the journal during transaction state changes so that nobody * ever tries to take a handle on the running transaction while we are in the * middle of moving it to the commit phase. j_state_lock does this. * * Note that the locking is completely interrupt unsafe. We never touch * journal structures from interrupts. */ static inline handle_t *journal_current_handle(void) { return current->journal_info; } /* The journaling code user interface: * * Create and destroy handles * Register buffer modifications against the current transaction. */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); extern int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); void jbd2_journal_wait_updates(journal_t *); extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); extern journal_t * jbd2_journal_init_inode (struct inode *); extern int jbd2_journal_update_format (journal_t *); extern int jbd2_journal_check_used_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy 
(journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, blk_opf_t); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* * handle management */ extern struct kmem_cache *jbd2_handle_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) static inline void jbd2_free_handle(handle_t *handle) { kmem_cache_free(jbd2_handle_cache, handle); } /* * jbd2_inode management (optional, for those file systems that want to use * dynamically allocated jbd2_inode structures) */ extern struct kmem_cache *jbd2_inode_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. 
*/ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) static inline void jbd2_free_inode(struct jbd2_inode *jinode) { kmem_cache_free(jbd2_inode_cache, jinode); } /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size); void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: * * Request space in the current transaction, and force transaction commit * transitions on demand. */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); void jbd2_fc_release_bufs(journal_t *journal); /* * is_journal_aborted * * Simple test wrapper function to test the JBD2_ABORT state flag. This * bit, when set, indicates that we have had a fatal error somewhere, * either inside the journaling layer or indicated to us by the client * (e.g. ext3), and that we should not commit any further * transactions.
*/ static inline int is_journal_aborted(journal_t *journal) { return journal->j_flags & JBD2_ABORT; } static inline int is_handle_aborted(handle_t *handle) { if (handle->h_aborted || !handle->h_transaction) return 1; return is_journal_aborted(handle->h_transaction->t_journal); } static inline void jbd2_journal_abort_handle(handle_t *handle) { handle->h_aborted = 1; } static inline void jbd2_init_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; /* * Save the original wb_err value of client fs's bdev mapping which * could be used to detect the client fs's metadata async write error. */ errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); } static inline int jbd2_check_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; return errseq_check(&mapping->wb_err, READ_ONCE(journal->j_fs_dev_wb_err)); } #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using * modulo arithmetic so that they work over sequence number wraps. */ static inline int tid_gt(tid_t x, tid_t y) { int difference = (x - y); return (difference > 0); } static inline int tid_geq(tid_t x, tid_t y) { int difference = (x - y); return (difference >= 0); } extern int jbd2_journal_blocks_per_folio(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { return jbd2_has_feature_csum2(journal) || jbd2_has_feature_csum3(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) { int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; } /* * Return number of free blocks in the log. Must be called under j_state_lock. 
*/ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ long free = journal->j_free - 32; if (journal->j_committing_transaction) { free -= atomic_read(&journal-> j_committing_transaction->t_outstanding_credits); } return max_t(long, free, 0); } /* * Definitions which augment the buffer_head layer */ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ #define BJ_Metadata 1 /* Normal journaled metadata */ #define BJ_Forget 2 /* Buffer superseded by this transaction */ #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 static inline u32 jbd2_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_commit_request; if (journal->j_running_transaction) tid = journal->j_running_transaction->t_tid; read_unlock(&journal->j_state_lock); return tid; } static inline int jbd2_handle_buffer_credits(handle_t *handle) { journal_t *journal; if (!handle->h_reserved) journal = handle->h_transaction->t_journal; else journal = handle->h_journal; return handle->h_total_credits - DIV_ROUND_UP(handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); } #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) #define print_buffer_fields(bh) do {} while (0) #define print_buffer_trace(bh) do {} while (0) #define BUFFER_TRACE(bh, info) do {} while (0) #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _LINUX_JBD2_H */
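/*
 * Editor's note: the block below is an illustrative sketch, not part of the
 * upstream header. It shows the handle lifecycle that the declarations above
 * describe ("create and destroy handles, register buffer modifications
 * against the current transaction"): start a handle on the running
 * transaction, declare write access, modify the buffer, mark it dirty, stop.
 * "example_update_block" is a hypothetical helper and error handling is
 * abbreviated.
 */
static int example_update_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve credit for one metadata block in the running transaction. */
	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Tell jbd2 we intend to modify this buffer under the handle. */
	err = jbd2_journal_get_write_access(handle, bh);
	if (err)
		goto out;

	/* ... modify bh->b_data here ... */

	/* Register the buffer as dirty metadata of this transaction. */
	err = jbd2_journal_dirty_metadata(handle, bh);
out:
	jbd2_journal_stop(handle);
	return err;
}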
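/*
 * Editor's note: for reference, this is what one invocation of the
 * feature-macro family above generates. The expansion of
 * JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) is reproduced
 * inside "#if 0" so it cannot clash with the real definitions. Note how
 * the "has" predicate trusts the feature bits only when
 * jbd2_format_support_feature() reports a non-v1 superblock.
 */
#if 0
static inline bool jbd2_has_feature_fast_commit(journal_t *j)
{
	return (jbd2_format_support_feature(j) &&
		((j)->j_superblock->s_feature_incompat &
		 cpu_to_be32(JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) != 0);
}
static inline void jbd2_set_feature_fast_commit(journal_t *j)
{
	(j)->j_superblock->s_feature_incompat |=
		cpu_to_be32(JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
}
static inline void jbd2_clear_feature_fast_commit(journal_t *j)
{
	(j)->j_superblock->s_feature_incompat &=
		~cpu_to_be32(JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
}
#endif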
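/*
 * Editor's note: a small stand-alone demonstration (user-space C, not
 * kernel code) of why tid_gt()/tid_geq() above compute a signed difference:
 * the comparison stays correct when the 32-bit transaction ID wraps.
 */
#include <stdio.h>

typedef unsigned int tid_t;

static int demo_tid_gt(tid_t x, tid_t y)
{
	int difference = (x - y);	/* same trick as tid_gt() above */
	return (difference > 0);
}

int main(void)
{
	/* Ordinary case: 10 was issued after 5. */
	printf("tid_gt(10, 5) = %d\n", demo_tid_gt(10, 5));
	/* Wrapped case: tid 5 was issued after tid 0xfffffff0,
	 * even though it is numerically smaller. */
	printf("tid_gt(5, 0xfffffff0) = %d\n", demo_tid_gt(5, 0xfffffff0));
	/* A plain unsigned compare would get the wrapped case wrong. */
	printf("plain compare gives %d\n", (tid_t)5 > (tid_t)0xfffffff0);
	return 0;
}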
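/*
 * Editor's note: a stand-alone sketch (user-space C) of the arithmetic in
 * jbd2_log_space_left() above: a 32-block safety margin is subtracted to
 * allow for rounding errors, credits already promised to a committing
 * transaction are reserved, and the result is clamped at zero. The helper
 * name and the literal inputs are hypothetical.
 */
#include <stdio.h>

static long demo_log_space_left(long j_free, long committing_credits)
{
	long free = j_free - 32;	/* allow for rounding errors */

	free -= committing_credits;	/* space the committing transaction owns */
	return free > 0 ? free : 0;	/* never report negative space */
}

int main(void)
{
	printf("%ld\n", demo_log_space_left(1000, 100));	/* prints 868 */
	printf("%ld\n", demo_log_space_left(40, 100));		/* prints 0, clamped */
	return 0;
}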
5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 | // SPDX-License-Identifier: GPL-2.0-or-later // // core.c -- Voltage/Current Regulator framework. 
// // Copyright 2007, 2008 Wolfson Microelectronics PLC. // Copyright 2008 SlimLogic Ltd. // // Author: Liam Girdwood <lrg@slimlogic.co.uk> #include <linux/kernel.h> #include <linux/init.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/async.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/suspend.h> #include <linux/delay.h> #include <linux/gpio/consumer.h> #include <linux/of.h> #include <linux/reboot.h> #include <linux/regmap.h> #include <linux/regulator/of_regulator.h> #include <linux/regulator/consumer.h> #include <linux/regulator/coupler.h> #include <linux/regulator/driver.h> #include <linux/regulator/machine.h> #include <linux/module.h> #define CREATE_TRACE_POINTS #include <trace/events/regulator.h> #include "dummy.h" #include "internal.h" #include "regnl.h" static DEFINE_WW_CLASS(regulator_ww_class); static DEFINE_MUTEX(regulator_nesting_mutex); static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_map_list); static LIST_HEAD(regulator_ena_gpio_list); static LIST_HEAD(regulator_supply_alias_list); static LIST_HEAD(regulator_coupler_list); static bool has_full_constraints; static struct dentry *debugfs_root; /* * struct regulator_map * * Used to provide symbolic supply names to devices. */ struct regulator_map { struct list_head list; const char *dev_name; /* The dev_name() for the consumer */ const char *supply; struct regulator_dev *regulator; }; /* * struct regulator_enable_gpio * * Management for shared enable GPIO pin */ struct regulator_enable_gpio { struct list_head list; struct gpio_desc *gpiod; u32 enable_count; /* number of regulators currently enabled via this GPIO */ u32 request_count; /* number of regulators sharing this GPIO */ }; /* * struct regulator_supply_alias * * Used to map lookups for a supply onto an alternative device.
*/ struct regulator_supply_alias { struct list_head list; struct device *src_dev; const char *src_supply; struct device *alias_dev; const char *alias_supply; }; static int _regulator_is_enabled(struct regulator_dev *rdev); static int _regulator_disable(struct regulator *regulator); static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags); static int _regulator_get_current_limit(struct regulator_dev *rdev); static unsigned int _regulator_get_mode(struct regulator_dev *rdev); static int _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data); static int _regulator_do_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV); static int regulator_balance_voltage(struct regulator_dev *rdev, suspend_state_t state); static struct regulator *create_regulator(struct regulator_dev *rdev, struct device *dev, const char *supply_name); static void destroy_regulator(struct regulator *regulator); static void _regulator_put(struct regulator *regulator); const char *rdev_get_name(struct regulator_dev *rdev) { if (rdev->constraints && rdev->constraints->name) return rdev->constraints->name; else if (rdev->desc->name) return rdev->desc->name; else return ""; } EXPORT_SYMBOL_GPL(rdev_get_name); static bool have_full_constraints(void) { return has_full_constraints || of_have_populated_dt(); } static bool regulator_ops_is_valid(struct regulator_dev *rdev, int ops) { if (!rdev->constraints) { rdev_err(rdev, "no constraints\n"); return false; } if (rdev->constraints->valid_ops_mask & ops) return true; return false; } /** * regulator_lock_nested - lock a single regulator * @rdev: regulator source * @ww_ctx: w/w mutex acquire context * * This function can be called many times by one task on * a single regulator and its mutex will be locked only * once. If a task other than the one that initially locked * the mutex calls this function, it will wait on the mutex. * * Return: 0 on success or a negative error number on failure. */ static inline int regulator_lock_nested(struct regulator_dev *rdev, struct ww_acquire_ctx *ww_ctx) { bool lock = false; int ret = 0; mutex_lock(&regulator_nesting_mutex); if (!ww_mutex_trylock(&rdev->mutex, ww_ctx)) { if (rdev->mutex_owner == current) rdev->ref_cnt++; else lock = true; if (lock) { mutex_unlock(&regulator_nesting_mutex); ret = ww_mutex_lock(&rdev->mutex, ww_ctx); mutex_lock(&regulator_nesting_mutex); } } else { lock = true; } if (lock && ret != -EDEADLK) { rdev->ref_cnt++; rdev->mutex_owner = current; } mutex_unlock(&regulator_nesting_mutex); return ret; } /** * regulator_lock - lock a single regulator * @rdev: regulator source * * This function can be called many times by one task on * a single regulator and its mutex will be locked only * once. If a task other than the one that initially locked * the mutex calls this function, it will wait on the mutex. */ static void regulator_lock(struct regulator_dev *rdev) { regulator_lock_nested(rdev, NULL); }
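/*
 * Editor's note: a stand-alone model (user-space C, not kernel code) of the
 * ref_cnt/mutex_owner bookkeeping used by regulator_lock_nested() and
 * regulator_unlock() above: repeated locks by the owning task only bump a
 * counter, and the lock is really dropped when the counter returns to zero.
 * "struct task" and the globals are hypothetical stand-ins for the real
 * task and rdev fields, and contention is reduced to an assertion.
 */
#include <assert.h>
#include <stdio.h>

struct task { const char *name; };

static struct task *owner;	/* models rdev->mutex_owner */
static int ref_cnt;		/* models rdev->ref_cnt */

static void model_lock(struct task *t)
{
	if (owner == t) {
		ref_cnt++;		/* nested acquisition: just count it */
		return;
	}
	assert(owner == NULL);		/* a real task would block here */
	owner = t;			/* first acquisition takes the mutex */
	ref_cnt = 1;
}

static void model_unlock(struct task *t)
{
	assert(owner == t && ref_cnt > 0);
	if (--ref_cnt == 0)
		owner = NULL;		/* last unlock releases the mutex */
}

int main(void)
{
	struct task a = { "A" };

	model_lock(&a);			/* takes the mutex */
	model_lock(&a);			/* nested: ref_cnt -> 2 */
	model_unlock(&a);		/* ref_cnt -> 1, still held */
	model_unlock(&a);		/* ref_cnt -> 0, released */
	printf("owner=%p ref_cnt=%d\n", (void *)owner, ref_cnt);
	return 0;
}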
/** * regulator_unlock - unlock a single regulator * @rdev: regulator source * * This function unlocks the mutex when the * reference counter reaches 0. */ static void regulator_unlock(struct regulator_dev *rdev) { mutex_lock(&regulator_nesting_mutex); if (--rdev->ref_cnt == 0) { rdev->mutex_owner = NULL; ww_mutex_unlock(&rdev->mutex); } WARN_ON_ONCE(rdev->ref_cnt < 0); mutex_unlock(&regulator_nesting_mutex); } /** * regulator_lock_two - lock two regulators * @rdev1: first regulator * @rdev2: second regulator * @ww_ctx: w/w mutex acquire context * * Locks both rdevs using the regulator_ww_class. */ static void regulator_lock_two(struct regulator_dev *rdev1, struct regulator_dev *rdev2, struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *held, *contended; int ret; ww_acquire_init(ww_ctx, &regulator_ww_class); /* Try to just grab both of them */ ret = regulator_lock_nested(rdev1, ww_ctx); WARN_ON(ret); ret = regulator_lock_nested(rdev2, ww_ctx); if (ret != -EDEADLOCK) { WARN_ON(ret); goto exit; } held = rdev1; contended = rdev2; while (true) { regulator_unlock(held); ww_mutex_lock_slow(&contended->mutex, ww_ctx); contended->ref_cnt++; contended->mutex_owner = current; swap(held, contended); ret = regulator_lock_nested(contended, ww_ctx); if (ret != -EDEADLOCK) { WARN_ON(ret); break; } } exit: ww_acquire_done(ww_ctx); } /** * regulator_unlock_two - unlock two regulators * @rdev1: first regulator * @rdev2: second regulator * @ww_ctx: w/w mutex acquire context * * The inverse of regulator_lock_two(). */ static void regulator_unlock_two(struct regulator_dev *rdev1, struct regulator_dev *rdev2, struct ww_acquire_ctx *ww_ctx) { regulator_unlock(rdev2); regulator_unlock(rdev1); ww_acquire_fini(ww_ctx); } static bool regulator_supply_is_couple(struct regulator_dev *rdev) { struct regulator_dev *c_rdev; int i; for (i = 1; i < rdev->coupling_desc.n_coupled; i++) { c_rdev = rdev->coupling_desc.coupled_rdevs[i]; if (rdev->supply->rdev == c_rdev) return true; } return false; } static void regulator_unlock_recursive(struct regulator_dev *rdev, unsigned int n_coupled) { struct regulator_dev *c_rdev, *supply_rdev; int i, supply_n_coupled; for (i = n_coupled; i > 0; i--) { c_rdev = rdev->coupling_desc.coupled_rdevs[i - 1]; if (!c_rdev) continue; if (c_rdev->supply && !regulator_supply_is_couple(c_rdev)) { supply_rdev = c_rdev->supply->rdev; supply_n_coupled = supply_rdev->coupling_desc.n_coupled; regulator_unlock_recursive(supply_rdev, supply_n_coupled); } regulator_unlock(c_rdev); } } static int regulator_lock_recursive(struct regulator_dev *rdev, struct regulator_dev **new_contended_rdev, struct regulator_dev **old_contended_rdev, struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *c_rdev; int i, err; for (i = 0; i < rdev->coupling_desc.n_coupled; i++) { c_rdev = rdev->coupling_desc.coupled_rdevs[i]; if (!c_rdev) continue; if (c_rdev != *old_contended_rdev) { err = regulator_lock_nested(c_rdev, ww_ctx); if (err) { if (err == -EDEADLK) { *new_contended_rdev = c_rdev; goto err_unlock; } /* shouldn't happen */ WARN_ON_ONCE(err != -EALREADY); } } else { *old_contended_rdev = NULL; } if (c_rdev->supply && !regulator_supply_is_couple(c_rdev)) { err = regulator_lock_recursive(c_rdev->supply->rdev, new_contended_rdev, old_contended_rdev, ww_ctx); if (err) { regulator_unlock(c_rdev); goto err_unlock; } } } return 0; err_unlock: regulator_unlock_recursive(rdev, i); return err; } /** * regulator_unlock_dependent - unlock regulator's suppliers and coupled * regulators * @rdev: regulator source * @ww_ctx: w/w mutex acquire context * * Unlock all regulators related to rdev by coupling or supplying.
*/ static void regulator_unlock_dependent(struct regulator_dev *rdev, struct ww_acquire_ctx *ww_ctx) { regulator_unlock_recursive(rdev, rdev->coupling_desc.n_coupled); ww_acquire_fini(ww_ctx); } /** * regulator_lock_dependent - lock regulator's suppliers and coupled regulators * @rdev: regulator source * @ww_ctx: w/w mutex acquire context * * This function is a wrapper around regulator_lock_recursive(), which locks * all regulators related to rdev by coupling or supplying. */ static void regulator_lock_dependent(struct regulator_dev *rdev, struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *new_contended_rdev = NULL; struct regulator_dev *old_contended_rdev = NULL; int err; mutex_lock(&regulator_list_mutex); ww_acquire_init(ww_ctx, &regulator_ww_class); do { if (new_contended_rdev) { ww_mutex_lock_slow(&new_contended_rdev->mutex, ww_ctx); old_contended_rdev = new_contended_rdev; old_contended_rdev->ref_cnt++; old_contended_rdev->mutex_owner = current; } err = regulator_lock_recursive(rdev, &new_contended_rdev, &old_contended_rdev, ww_ctx); if (old_contended_rdev) regulator_unlock(old_contended_rdev); } while (err == -EDEADLK); ww_acquire_done(ww_ctx); mutex_unlock(&regulator_list_mutex); } /* Platform voltage constraint check */ int regulator_check_voltage(struct regulator_dev *rdev, int *min_uV, int *max_uV) { BUG_ON(*min_uV > *max_uV); if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) { rdev_err(rdev, "voltage operation not allowed\n"); return -EPERM; } if (*max_uV > rdev->constraints->max_uV) *max_uV = rdev->constraints->max_uV; if (*min_uV < rdev->constraints->min_uV) *min_uV = rdev->constraints->min_uV; if (*min_uV > *max_uV) { rdev_err(rdev, "unsupportable voltage range: %d-%duV\n", *min_uV, *max_uV); return -EINVAL; } return 0; } /* return 0 if the state is valid */ static int regulator_check_states(suspend_state_t state) { return (state > PM_SUSPEND_MAX || state == PM_SUSPEND_TO_IDLE); } /* Make sure we select a voltage that suits the needs of all * regulator consumers */ int regulator_check_consumers(struct regulator_dev *rdev, int *min_uV, int *max_uV, suspend_state_t state) { struct regulator *regulator; struct regulator_voltage *voltage; list_for_each_entry(regulator, &rdev->consumer_list, list) { voltage = &regulator->voltage[state]; /* * Assume consumers that didn't say anything are OK * with anything in the constraint range.
*/ if (!voltage->min_uV && !voltage->max_uV) continue; if (*max_uV > voltage->max_uV) *max_uV = voltage->max_uV; if (*min_uV < voltage->min_uV) *min_uV = voltage->min_uV; } if (*min_uV > *max_uV) { rdev_err(rdev, "Restricting voltage, %u-%uuV\n", *min_uV, *max_uV); return -EINVAL; } return 0; } /* current constraint check */ static int regulator_check_current_limit(struct regulator_dev *rdev, int *min_uA, int *max_uA) { BUG_ON(*min_uA > *max_uA); if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_CURRENT)) { rdev_err(rdev, "current operation not allowed\n"); return -EPERM; } if (*max_uA > rdev->constraints->max_uA && rdev->constraints->max_uA) *max_uA = rdev->constraints->max_uA; if (*min_uA < rdev->constraints->min_uA) *min_uA = rdev->constraints->min_uA; if (*min_uA > *max_uA) { rdev_err(rdev, "unsupportable current range: %d-%duA\n", *min_uA, *max_uA); return -EINVAL; } return 0; } /* operating mode constraint check */ static int regulator_mode_constrain(struct regulator_dev *rdev, unsigned int *mode) { switch (*mode) { case REGULATOR_MODE_FAST: case REGULATOR_MODE_NORMAL: case REGULATOR_MODE_IDLE: case REGULATOR_MODE_STANDBY: break; default: rdev_err(rdev, "invalid mode %x specified\n", *mode); return -EINVAL; } if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_MODE)) { rdev_err(rdev, "mode operation not allowed\n"); return -EPERM; } /* The modes are bitmasks, the most power hungry modes having * the lowest values. If the requested mode isn't supported * try higher modes. */ while (*mode) { if (rdev->constraints->valid_modes_mask & *mode) return 0; *mode /= 2; } return -EINVAL; } static inline struct regulator_state * regulator_get_suspend_state(struct regulator_dev *rdev, suspend_state_t state) { if (rdev->constraints == NULL) return NULL; switch (state) { case PM_SUSPEND_STANDBY: return &rdev->constraints->state_standby; case PM_SUSPEND_MEM: return &rdev->constraints->state_mem; case PM_SUSPEND_MAX: return &rdev->constraints->state_disk; default: return NULL; } } static const struct regulator_state * regulator_get_suspend_state_check(struct regulator_dev *rdev, suspend_state_t state) { const struct regulator_state *rstate; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return NULL; /* If we have no suspend mode configuration don't set anything; * only warn if the driver implements set_suspend_voltage or * set_suspend_mode callback. 
*/ if (rstate->enabled != ENABLE_IN_SUSPEND && rstate->enabled != DISABLE_IN_SUSPEND) { if (rdev->desc->ops->set_suspend_voltage || rdev->desc->ops->set_suspend_mode) rdev_warn(rdev, "No configuration\n"); return NULL; } return rstate; } static ssize_t microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); int uV; regulator_lock(rdev); uV = regulator_get_voltage_rdev(rdev); regulator_unlock(rdev); if (uV < 0) return uV; return sprintf(buf, "%d\n", uV); } static DEVICE_ATTR_RO(microvolts); static ssize_t microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", _regulator_get_current_limit(rdev)); } static DEVICE_ATTR_RO(microamps); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%s\n", rdev_get_name(rdev)); } static DEVICE_ATTR_RO(name); static const char *regulator_opmode_to_str(int mode) { switch (mode) { case REGULATOR_MODE_FAST: return "fast"; case REGULATOR_MODE_NORMAL: return "normal"; case REGULATOR_MODE_IDLE: return "idle"; case REGULATOR_MODE_STANDBY: return "standby"; } return "unknown"; } static ssize_t regulator_print_opmode(char *buf, int mode) { return sprintf(buf, "%s\n", regulator_opmode_to_str(mode)); } static ssize_t opmode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, _regulator_get_mode(rdev)); } static DEVICE_ATTR_RO(opmode); static ssize_t regulator_print_state(char *buf, int state) { if (state > 0) return sprintf(buf, "enabled\n"); else if (state == 0) return sprintf(buf, "disabled\n"); else return sprintf(buf, "unknown\n"); } static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); ssize_t ret; regulator_lock(rdev); ret = regulator_print_state(buf, _regulator_is_enabled(rdev)); regulator_unlock(rdev); return ret; } static DEVICE_ATTR_RO(state); static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); int status; char *label; status = rdev->desc->ops->get_status(rdev); if (status < 0) return status; switch (status) { case REGULATOR_STATUS_OFF: label = "off"; break; case REGULATOR_STATUS_ON: label = "on"; break; case REGULATOR_STATUS_ERROR: label = "error"; break; case REGULATOR_STATUS_FAST: label = "fast"; break; case REGULATOR_STATUS_NORMAL: label = "normal"; break; case REGULATOR_STATUS_IDLE: label = "idle"; break; case REGULATOR_STATUS_STANDBY: label = "standby"; break; case REGULATOR_STATUS_BYPASS: label = "bypass"; break; case REGULATOR_STATUS_UNDEFINED: label = "undefined"; break; default: return -ERANGE; } return sprintf(buf, "%s\n", label); } static DEVICE_ATTR_RO(status); static ssize_t min_microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->min_uA); } static DEVICE_ATTR_RO(min_microamps); static ssize_t max_microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); 
return sprintf(buf, "%d\n", rdev->constraints->max_uA); } static DEVICE_ATTR_RO(max_microamps); static ssize_t min_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->min_uV); } static DEVICE_ATTR_RO(min_microvolts); static ssize_t max_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->max_uV); } static DEVICE_ATTR_RO(max_microvolts); static ssize_t requested_microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); struct regulator *regulator; int uA = 0; regulator_lock(rdev); list_for_each_entry(regulator, &rdev->consumer_list, list) { if (regulator->enable_count) uA += regulator->uA_load; } regulator_unlock(rdev); return sprintf(buf, "%d\n", uA); } static DEVICE_ATTR_RO(requested_microamps); static ssize_t num_users_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->use_count); } static DEVICE_ATTR_RO(num_users); static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); switch (rdev->desc->type) { case REGULATOR_VOLTAGE: return sprintf(buf, "voltage\n"); case REGULATOR_CURRENT: return sprintf(buf, "current\n"); } return sprintf(buf, "unknown\n"); } static DEVICE_ATTR_RO(type); static ssize_t suspend_mem_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->state_mem.uV); } static DEVICE_ATTR_RO(suspend_mem_microvolts); static ssize_t suspend_disk_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->state_disk.uV); } static DEVICE_ATTR_RO(suspend_disk_microvolts); static ssize_t suspend_standby_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->state_standby.uV); } static DEVICE_ATTR_RO(suspend_standby_microvolts); static ssize_t suspend_mem_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, rdev->constraints->state_mem.mode); } static DEVICE_ATTR_RO(suspend_mem_mode); static ssize_t suspend_disk_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, rdev->constraints->state_disk.mode); } static DEVICE_ATTR_RO(suspend_disk_mode); static ssize_t suspend_standby_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, rdev->constraints->state_standby.mode); } static DEVICE_ATTR_RO(suspend_standby_mode); static ssize_t suspend_mem_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return 
regulator_print_state(buf, rdev->constraints->state_mem.enabled); } static DEVICE_ATTR_RO(suspend_mem_state); static ssize_t suspend_disk_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_state(buf, rdev->constraints->state_disk.enabled); } static DEVICE_ATTR_RO(suspend_disk_state); static ssize_t suspend_standby_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_state(buf, rdev->constraints->state_standby.enabled); } static DEVICE_ATTR_RO(suspend_standby_state); static ssize_t bypass_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); const char *report; bool bypass; int ret; ret = rdev->desc->ops->get_bypass(rdev, &bypass); if (ret != 0) report = "unknown"; else if (bypass) report = "enabled"; else report = "disabled"; return sprintf(buf, "%s\n", report); } static DEVICE_ATTR_RO(bypass); static ssize_t power_budget_milliwatt_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->pw_budget_mW); } static DEVICE_ATTR_RO(power_budget_milliwatt); static ssize_t power_requested_milliwatt_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->pw_requested_mW); } static DEVICE_ATTR_RO(power_requested_milliwatt); #define REGULATOR_ERROR_ATTR(name, bit) \ static ssize_t name##_show(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ int ret; \ unsigned int flags; \ struct regulator_dev *rdev = dev_get_drvdata(dev); \ ret = _regulator_get_error_flags(rdev, &flags); \ if (ret) \ return ret; \ return sysfs_emit(buf, "%d\n", !!(flags & (bit))); \ } \ static DEVICE_ATTR_RO(name) REGULATOR_ERROR_ATTR(under_voltage, REGULATOR_ERROR_UNDER_VOLTAGE); REGULATOR_ERROR_ATTR(over_current, REGULATOR_ERROR_OVER_CURRENT); REGULATOR_ERROR_ATTR(regulation_out, REGULATOR_ERROR_REGULATION_OUT); REGULATOR_ERROR_ATTR(fail, REGULATOR_ERROR_FAIL); REGULATOR_ERROR_ATTR(over_temp, REGULATOR_ERROR_OVER_TEMP); REGULATOR_ERROR_ATTR(under_voltage_warn, REGULATOR_ERROR_UNDER_VOLTAGE_WARN); REGULATOR_ERROR_ATTR(over_current_warn, REGULATOR_ERROR_OVER_CURRENT_WARN); REGULATOR_ERROR_ATTR(over_voltage_warn, REGULATOR_ERROR_OVER_VOLTAGE_WARN); REGULATOR_ERROR_ATTR(over_temp_warn, REGULATOR_ERROR_OVER_TEMP_WARN); /* Calculate the new optimum regulator operating mode based on the new total * consumer load. All locks held by caller */ static int drms_uA_update(struct regulator_dev *rdev) { struct regulator *sibling; int current_uA = 0, output_uV, input_uV, err; unsigned int mode; /* * first check to see if we can set modes at all, otherwise just * tell the consumer everything is OK. 
*/ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_DRMS)) { rdev_dbg(rdev, "DRMS operation not allowed\n"); return 0; } if (!rdev->desc->ops->get_optimum_mode && !rdev->desc->ops->set_load) return 0; if (!rdev->desc->ops->set_mode && !rdev->desc->ops->set_load) return -EINVAL; /* calc total requested load */ list_for_each_entry(sibling, &rdev->consumer_list, list) { if (sibling->enable_count) current_uA += sibling->uA_load; } current_uA += rdev->constraints->system_load; if (rdev->desc->ops->set_load) { /* set the optimum mode for our new total regulator load */ err = rdev->desc->ops->set_load(rdev, current_uA); if (err < 0) rdev_err(rdev, "failed to set load %d: %pe\n", current_uA, ERR_PTR(err)); } else { /* * Unfortunately in some cases the constraints->valid_ops has * REGULATOR_CHANGE_DRMS but there are no valid modes listed. * That's not really legit but we won't consider it a fatal * error here. We'll treat it as if REGULATOR_CHANGE_DRMS * wasn't set. */ if (!rdev->constraints->valid_modes_mask) { rdev_dbg(rdev, "Can change modes; but no valid mode\n"); return 0; } /* get output voltage */ output_uV = regulator_get_voltage_rdev(rdev); /* * Don't return an error; if regulator driver cares about * output_uV then it's up to the driver to validate. */ if (output_uV <= 0) rdev_dbg(rdev, "invalid output voltage found\n"); /* get input voltage */ input_uV = 0; if (rdev->supply) input_uV = regulator_get_voltage_rdev(rdev->supply->rdev); if (input_uV <= 0) input_uV = rdev->constraints->input_uV; /* * Don't return an error; if regulator driver cares about * input_uV then it's up to the driver to validate. */ if (input_uV <= 0) rdev_dbg(rdev, "invalid input voltage found\n"); /* now get the optimum mode for our new total regulator load */ mode = rdev->desc->ops->get_optimum_mode(rdev, input_uV, output_uV, current_uA); /* check the new mode is allowed */ err = regulator_mode_constrain(rdev, &mode); if (err < 0) { rdev_err(rdev, "failed to get optimum mode @ %d uA %d -> %d uV: %pe\n", current_uA, input_uV, output_uV, ERR_PTR(err)); return err; } err = rdev->desc->ops->set_mode(rdev, mode); if (err < 0) rdev_err(rdev, "failed to set optimum mode %x: %pe\n", mode, ERR_PTR(err)); } return err; } static int __suspend_set_state(struct regulator_dev *rdev, const struct regulator_state *rstate) { int ret = 0; if (rstate->enabled == ENABLE_IN_SUSPEND && rdev->desc->ops->set_suspend_enable) ret = rdev->desc->ops->set_suspend_enable(rdev); else if (rstate->enabled == DISABLE_IN_SUSPEND && rdev->desc->ops->set_suspend_disable) ret = rdev->desc->ops->set_suspend_disable(rdev); else /* OK if set_suspend_enable or set_suspend_disable is NULL */ ret = 0; if (ret < 0) { rdev_err(rdev, "failed to enable/disable: %pe\n", ERR_PTR(ret)); return ret; } if (rdev->desc->ops->set_suspend_voltage && rstate->uV > 0) { ret = rdev->desc->ops->set_suspend_voltage(rdev, rstate->uV); if (ret < 0) { rdev_err(rdev, "failed to set voltage: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->desc->ops->set_suspend_mode && rstate->mode > 0) { ret = rdev->desc->ops->set_suspend_mode(rdev, rstate->mode); if (ret < 0) { rdev_err(rdev, "failed to set mode: %pe\n", ERR_PTR(ret)); return ret; } } return ret; } static int suspend_set_initial_state(struct regulator_dev *rdev) { const struct regulator_state *rstate; rstate = regulator_get_suspend_state_check(rdev, rdev->constraints->initial_state); if (!rstate) return 0; return __suspend_set_state(rdev, rstate); } #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) static void 
print_constraints_debug(struct regulator_dev *rdev) { struct regulation_constraints *constraints = rdev->constraints; char buf[160] = ""; size_t len = sizeof(buf) - 1; int count = 0; int ret; if (constraints->min_uV && constraints->max_uV) { if (constraints->min_uV == constraints->max_uV) count += scnprintf(buf + count, len - count, "%d mV ", constraints->min_uV / 1000); else count += scnprintf(buf + count, len - count, "%d <--> %d mV ", constraints->min_uV / 1000, constraints->max_uV / 1000); } if (!constraints->min_uV || constraints->min_uV != constraints->max_uV) { ret = regulator_get_voltage_rdev(rdev); if (ret > 0) count += scnprintf(buf + count, len - count, "at %d mV ", ret / 1000); } if (constraints->uV_offset) count += scnprintf(buf + count, len - count, "%dmV offset ", constraints->uV_offset / 1000); if (constraints->min_uA && constraints->max_uA) { if (constraints->min_uA == constraints->max_uA) count += scnprintf(buf + count, len - count, "%d mA ", constraints->min_uA / 1000); else count += scnprintf(buf + count, len - count, "%d <--> %d mA ", constraints->min_uA / 1000, constraints->max_uA / 1000); } if (!constraints->min_uA || constraints->min_uA != constraints->max_uA) { ret = _regulator_get_current_limit(rdev); if (ret > 0) count += scnprintf(buf + count, len - count, "at %d mA ", ret / 1000); } if (constraints->valid_modes_mask & REGULATOR_MODE_FAST) count += scnprintf(buf + count, len - count, "fast "); if (constraints->valid_modes_mask & REGULATOR_MODE_NORMAL) count += scnprintf(buf + count, len - count, "normal "); if (constraints->valid_modes_mask & REGULATOR_MODE_IDLE) count += scnprintf(buf + count, len - count, "idle "); if (constraints->valid_modes_mask & REGULATOR_MODE_STANDBY) count += scnprintf(buf + count, len - count, "standby "); if (constraints->pw_budget_mW) count += scnprintf(buf + count, len - count, "%d mW budget", constraints->pw_budget_mW); if (!count) count = scnprintf(buf, len, "no parameters"); else --count; count += scnprintf(buf + count, len - count, ", %s", _regulator_is_enabled(rdev) ? 
"enabled" : "disabled"); rdev_dbg(rdev, "%s\n", buf); } #else /* !DEBUG && !CONFIG_DYNAMIC_DEBUG */ static inline void print_constraints_debug(struct regulator_dev *rdev) {} #endif /* !DEBUG && !CONFIG_DYNAMIC_DEBUG */ static void print_constraints(struct regulator_dev *rdev) { struct regulation_constraints *constraints = rdev->constraints; print_constraints_debug(rdev); if ((constraints->min_uV != constraints->max_uV) && !regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) rdev_warn(rdev, "Voltage range but no REGULATOR_CHANGE_VOLTAGE\n"); } static int machine_constraints_voltage(struct regulator_dev *rdev, struct regulation_constraints *constraints) { const struct regulator_ops *ops = rdev->desc->ops; int ret; /* do we need to apply the constraint voltage */ if (rdev->constraints->apply_uV && rdev->constraints->min_uV && rdev->constraints->max_uV) { int target_min, target_max; int current_uV = regulator_get_voltage_rdev(rdev); if (current_uV == -ENOTRECOVERABLE) { /* This regulator can't be read and must be initialized */ rdev_info(rdev, "Setting %d-%duV\n", rdev->constraints->min_uV, rdev->constraints->max_uV); _regulator_do_set_voltage(rdev, rdev->constraints->min_uV, rdev->constraints->max_uV); current_uV = regulator_get_voltage_rdev(rdev); } if (current_uV < 0) { if (current_uV != -EPROBE_DEFER) rdev_err(rdev, "failed to get the current voltage: %pe\n", ERR_PTR(current_uV)); return current_uV; } /* * If we're below the minimum voltage move up to the * minimum voltage, if we're above the maximum voltage * then move down to the maximum. */ target_min = current_uV; target_max = current_uV; if (current_uV < rdev->constraints->min_uV) { target_min = rdev->constraints->min_uV; target_max = rdev->constraints->min_uV; } if (current_uV > rdev->constraints->max_uV) { target_min = rdev->constraints->max_uV; target_max = rdev->constraints->max_uV; } if (target_min != current_uV || target_max != current_uV) { rdev_info(rdev, "Bringing %duV into %d-%duV\n", current_uV, target_min, target_max); ret = _regulator_do_set_voltage( rdev, target_min, target_max); if (ret < 0) { rdev_err(rdev, "failed to apply %d-%duV constraint: %pe\n", target_min, target_max, ERR_PTR(ret)); return ret; } } } /* constrain machine-level voltage specs to fit * the actual range supported by this regulator. */ if (ops->list_voltage && rdev->desc->n_voltages) { int count = rdev->desc->n_voltages; int i; int min_uV = INT_MAX; int max_uV = INT_MIN; int cmin = constraints->min_uV; int cmax = constraints->max_uV; /* it's safe to autoconfigure fixed-voltage supplies * and the constraints are used by list_voltage. 
*/ if (count == 1 && !cmin) { cmin = 1; cmax = INT_MAX; constraints->min_uV = cmin; constraints->max_uV = cmax; } /* voltage constraints are optional */ if ((cmin == 0) && (cmax == 0)) return 0; /* else require explicit machine-level constraints */ if (cmin <= 0 || cmax <= 0 || cmax < cmin) { rdev_err(rdev, "invalid voltage constraints\n"); return -EINVAL; } /* no need to loop voltages if range is continuous */ if (rdev->desc->continuous_voltage_range) return 0; /* initial: [cmin..cmax] valid, [min_uV..max_uV] not */ for (i = 0; i < count; i++) { int value; value = ops->list_voltage(rdev, i); if (value <= 0) continue; /* maybe adjust [min_uV..max_uV] */ if (value >= cmin && value < min_uV) min_uV = value; if (value <= cmax && value > max_uV) max_uV = value; } /* final: [min_uV..max_uV] valid iff constraints valid */ if (max_uV < min_uV) { rdev_err(rdev, "unsupportable voltage constraints %u-%uuV\n", min_uV, max_uV); return -EINVAL; } /* use regulator's subset of machine constraints */ if (constraints->min_uV < min_uV) { rdev_dbg(rdev, "override min_uV, %d -> %d\n", constraints->min_uV, min_uV); constraints->min_uV = min_uV; } if (constraints->max_uV > max_uV) { rdev_dbg(rdev, "override max_uV, %d -> %d\n", constraints->max_uV, max_uV); constraints->max_uV = max_uV; } } return 0; } static int machine_constraints_current(struct regulator_dev *rdev, struct regulation_constraints *constraints) { const struct regulator_ops *ops = rdev->desc->ops; int ret; if (!constraints->min_uA && !constraints->max_uA) return 0; if (constraints->min_uA > constraints->max_uA) { rdev_err(rdev, "Invalid current constraints\n"); return -EINVAL; } if (!ops->set_current_limit || !ops->get_current_limit) { rdev_warn(rdev, "Operation of current configuration missing\n"); return 0; } /* Set regulator current in constraints range */ ret = ops->set_current_limit(rdev, constraints->min_uA, constraints->max_uA); if (ret < 0) { rdev_err(rdev, "Failed to set current constraint, %d\n", ret); return ret; } return 0; } static int _regulator_do_enable(struct regulator_dev *rdev); static int notif_set_limit(struct regulator_dev *rdev, int (*set)(struct regulator_dev *, int, int, bool), int limit, int severity) { bool enable; if (limit == REGULATOR_NOTIF_LIMIT_DISABLE) { enable = false; limit = 0; } else { enable = true; } if (limit == REGULATOR_NOTIF_LIMIT_ENABLE) limit = 0; return set(rdev, limit, severity, enable); } static int handle_notify_limits(struct regulator_dev *rdev, int (*set)(struct regulator_dev *, int, int, bool), struct notification_limit *limits) { int ret = 0; if (!set) return -EOPNOTSUPP; if (limits->prot) ret = notif_set_limit(rdev, set, limits->prot, REGULATOR_SEVERITY_PROT); if (ret) return ret; if (limits->err) ret = notif_set_limit(rdev, set, limits->err, REGULATOR_SEVERITY_ERR); if (ret) return ret; if (limits->warn) ret = notif_set_limit(rdev, set, limits->warn, REGULATOR_SEVERITY_WARN); return ret; } /** * set_machine_constraints - sets regulator constraints * @rdev: regulator source * * Allows platform initialisation code to define and constrain * regulator circuits e.g. valid voltage/current ranges, etc. NOTE: * Constraints *must* be set by platform code in order for some * regulator operations to proceed i.e. set_voltage, set_current_limit, * set_mode. * * Return: 0 on success or a negative error number on failure. 
*/ static int set_machine_constraints(struct regulator_dev *rdev) { int ret = 0; const struct regulator_ops *ops = rdev->desc->ops; ret = machine_constraints_voltage(rdev, rdev->constraints); if (ret != 0) return ret; ret = machine_constraints_current(rdev, rdev->constraints); if (ret != 0) return ret; if (rdev->constraints->ilim_uA && ops->set_input_current_limit) { ret = ops->set_input_current_limit(rdev, rdev->constraints->ilim_uA); if (ret < 0) { rdev_err(rdev, "failed to set input limit: %pe\n", ERR_PTR(ret)); return ret; } } /* do we need to setup our suspend state */ if (rdev->constraints->initial_state) { ret = suspend_set_initial_state(rdev); if (ret < 0) { rdev_err(rdev, "failed to set suspend state: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->initial_mode) { if (!ops->set_mode) { rdev_err(rdev, "no set_mode operation\n"); return -EINVAL; } ret = ops->set_mode(rdev, rdev->constraints->initial_mode); if (ret < 0) { rdev_err(rdev, "failed to set initial mode: %pe\n", ERR_PTR(ret)); return ret; } } else if (rdev->constraints->system_load) { /* * We'll only apply the initial system load if an * initial mode wasn't specified. */ drms_uA_update(rdev); } if ((rdev->constraints->ramp_delay || rdev->constraints->ramp_disable) && ops->set_ramp_delay) { ret = ops->set_ramp_delay(rdev, rdev->constraints->ramp_delay); if (ret < 0) { rdev_err(rdev, "failed to set ramp_delay: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->pull_down && ops->set_pull_down) { ret = ops->set_pull_down(rdev); if (ret < 0) { rdev_err(rdev, "failed to set pull down: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->soft_start && ops->set_soft_start) { ret = ops->set_soft_start(rdev); if (ret < 0) { rdev_err(rdev, "failed to set soft start: %pe\n", ERR_PTR(ret)); return ret; } } /* * Existing logic does not warn if over_current_protection is given as * a constraint but the driver does not support that. I think we should * warn about this type of issue as it is possible someone changes * the PMIC on a board to another type - and the other PMIC's driver does * not support setting protection. The board composer may happily believe * the DT limits are respected - especially if the new PMIC HW also * supports protection but the driver does not. I won't change the logic * without hearing a more experienced opinion on this though. * * If warning is seen as a good idea then we can merge handling the * over-current protection and detection and get rid of this special * handling.
*/ if (rdev->constraints->over_current_protection && ops->set_over_current_protection) { int lim = rdev->constraints->over_curr_limits.prot; ret = ops->set_over_current_protection(rdev, lim, REGULATOR_SEVERITY_PROT, true); if (ret < 0) { rdev_err(rdev, "failed to set over current protection: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->over_current_detection) ret = handle_notify_limits(rdev, ops->set_over_current_protection, &rdev->constraints->over_curr_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set over current limits: %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested over-current limits\n"); } if (rdev->constraints->over_voltage_detection) ret = handle_notify_limits(rdev, ops->set_over_voltage_protection, &rdev->constraints->over_voltage_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set over voltage limits: %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested over voltage limits\n"); } if (rdev->constraints->under_voltage_detection) ret = handle_notify_limits(rdev, ops->set_under_voltage_protection, &rdev->constraints->under_voltage_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set under voltage limits: %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested under voltage limits\n"); } if (rdev->constraints->over_temp_detection) ret = handle_notify_limits(rdev, ops->set_thermal_protection, &rdev->constraints->temp_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set temperature limits: %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested temperature limits\n"); } if (rdev->constraints->active_discharge && ops->set_active_discharge) { bool ad_state = rdev->constraints->active_discharge == REGULATOR_ACTIVE_DISCHARGE_ENABLE; ret = ops->set_active_discharge(rdev, ad_state); if (ret < 0) { rdev_err(rdev, "failed to set active discharge: %pe\n", ERR_PTR(ret)); return ret; } } /* * If there is no mechanism for controlling the regulator then * flag it as always_on so we don't end up duplicating checks * for this so much. Note that we could control the state of * a supply to control the output on a regulator that has no * direct control. */ if (!rdev->ena_pin && !ops->enable) { if (rdev->supply_name && !rdev->supply) return -EPROBE_DEFER; if (rdev->supply) rdev->constraints->always_on = rdev->supply->rdev->constraints->always_on; else rdev->constraints->always_on = true; } /* If the constraints say the regulator should be on at this point * and we have control then make sure it is enabled. */ if (rdev->constraints->always_on || rdev->constraints->boot_on) { /* If we want to enable this regulator, make sure that we know * the supplying regulator. */ if (rdev->supply_name && !rdev->supply) return -EPROBE_DEFER; /* If the supplying regulator has already been enabled, * it's not intended to have use_count increment * when rdev is only boot-on.
*/ if (rdev->supply && (rdev->constraints->always_on || !regulator_is_enabled(rdev->supply))) { ret = regulator_enable(rdev->supply); if (ret < 0) { _regulator_put(rdev->supply); rdev->supply = NULL; return ret; } } ret = _regulator_do_enable(rdev); if (ret < 0 && ret != -EINVAL) { rdev_err(rdev, "failed to enable: %pe\n", ERR_PTR(ret)); return ret; } if (rdev->constraints->always_on) rdev->use_count++; } else if (rdev->desc->off_on_delay) { rdev->last_off = ktime_get(); } if (!rdev->constraints->pw_budget_mW) rdev->constraints->pw_budget_mW = INT_MAX; print_constraints(rdev); return 0; } /** * set_supply - set regulator supply regulator * @rdev: regulator (locked) * @supply_rdev: supply regulator (locked) * * Called by platform initialisation code to set the supply regulator for this * regulator. This ensures that a regulator's supply will also be enabled by the * core if its child is enabled. * * Return: 0 on success or a negative error number on failure. */ static int set_supply(struct regulator_dev *rdev, struct regulator_dev *supply_rdev) { int err; rdev_dbg(rdev, "supplied by %s\n", rdev_get_name(supply_rdev)); if (!try_module_get(supply_rdev->owner)) return -ENODEV; rdev->supply = create_regulator(supply_rdev, &rdev->dev, "SUPPLY"); if (rdev->supply == NULL) { module_put(supply_rdev->owner); err = -ENOMEM; return err; } supply_rdev->open_count++; return 0; }
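/*
 * A supply link of the kind set_supply() records is normally described
 * in the device tree along these lines (node and supply names here are
 * illustrative, not taken from any real board):
 *
 *	buck2: regulator-buck2 { ... };
 *
 *	ldo1: regulator-ldo1 {
 *		vin-supply = <&buck2>;
 *	};
 *
 * With such a description, enabling a consumer of ldo1 also enables
 * buck2 through rdev->supply.
 */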
/** * set_consumer_device_supply - Bind a regulator to a symbolic supply * @rdev: regulator source * @consumer_dev_name: dev_name() string for device supply applies to * @supply: symbolic name for supply * * Allows platform initialisation code to map physical regulator * sources to symbolic names for supplies for use by devices. Devices * should use these symbolic names to request regulators, avoiding the * need to provide board-specific regulator names as platform data. * * Return: 0 on success or a negative error number on failure. */ static int set_consumer_device_supply(struct regulator_dev *rdev, const char *consumer_dev_name, const char *supply) { struct regulator_map *node, *new_node; int has_dev; if (supply == NULL) return -EINVAL; if (consumer_dev_name != NULL) has_dev = 1; else has_dev = 0; new_node = kzalloc(sizeof(struct regulator_map), GFP_KERNEL); if (new_node == NULL) return -ENOMEM; new_node->regulator = rdev; new_node->supply = supply; if (has_dev) { new_node->dev_name = kstrdup(consumer_dev_name, GFP_KERNEL); if (new_node->dev_name == NULL) { kfree(new_node); return -ENOMEM; } } mutex_lock(&regulator_list_mutex); list_for_each_entry(node, &regulator_map_list, list) { if (node->dev_name && consumer_dev_name) { if (strcmp(node->dev_name, consumer_dev_name) != 0) continue; } else if (node->dev_name || consumer_dev_name) { continue; } if (strcmp(node->supply, supply) != 0) continue; pr_debug("%s: %s/%s is '%s' supply; fail %s/%s\n", consumer_dev_name, dev_name(&node->regulator->dev), node->regulator->desc->name, supply, dev_name(&rdev->dev), rdev_get_name(rdev)); goto fail; } list_add(&new_node->list, &regulator_map_list); mutex_unlock(&regulator_list_mutex); return 0; fail: mutex_unlock(&regulator_list_mutex); kfree(new_node->dev_name); kfree(new_node); return -EBUSY; } static void unset_regulator_supplies(struct regulator_dev *rdev) { struct regulator_map *node, *n; list_for_each_entry_safe(node, n, &regulator_map_list, list) { if (rdev == node->regulator) { list_del(&node->list); kfree(node->dev_name); kfree(node); } } } #ifdef CONFIG_DEBUG_FS static ssize_t constraint_flags_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { const struct regulator *regulator = file->private_data; const struct regulation_constraints *c = regulator->rdev->constraints; char *buf; ssize_t ret; if (!c) return 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; ret = snprintf(buf, PAGE_SIZE, "always_on: %u\n" "boot_on: %u\n" "apply_uV: %u\n" "ramp_disable: %u\n" "soft_start: %u\n" "pull_down: %u\n" "over_current_protection: %u\n", c->always_on, c->boot_on, c->apply_uV, c->ramp_disable, c->soft_start, c->pull_down, c->over_current_protection); ret = simple_read_from_buffer(user_buf, count, ppos, buf, ret); kfree(buf); return ret; } #endif static const struct file_operations constraint_flags_fops = { #ifdef CONFIG_DEBUG_FS .open = simple_open, .read = constraint_flags_read_file, .llseek = default_llseek, #endif }; #define REG_STR_SIZE 64 static void link_and_create_debugfs(struct regulator *regulator, struct regulator_dev *rdev, struct device *dev) { int err = 0; if (dev) { regulator->dev = dev; /* Add a link to the device sysfs entry */ err = sysfs_create_link_nowarn(&rdev->dev.kobj, &dev->kobj, regulator->supply_name); if (err) { rdev_dbg(rdev, "could not add device link %s: %pe\n", dev->kobj.name, ERR_PTR(err)); /* non-fatal */ } } if (err != -EEXIST) { regulator->debugfs = debugfs_create_dir(regulator->supply_name, rdev->debugfs); if (IS_ERR(regulator->debugfs)) { rdev_dbg(rdev, "Failed to create debugfs directory\n"); regulator->debugfs = NULL; } } if (regulator->debugfs) { debugfs_create_u32("uA_load", 0444, regulator->debugfs, &regulator->uA_load); debugfs_create_u32("min_uV", 0444, regulator->debugfs, &regulator->voltage[PM_SUSPEND_ON].min_uV); debugfs_create_u32("max_uV", 0444, regulator->debugfs, &regulator->voltage[PM_SUSPEND_ON].max_uV); debugfs_create_file("constraint_flags", 0444, regulator->debugfs, regulator, &constraint_flags_fops); } }
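/*
 * The per-consumer debugfs entries created above land under the
 * regulator's own directory, roughly as follows (paths illustrative;
 * they depend on where debugfs is mounted and on the regulator and
 * consumer names):
 *
 *	.../regulator/<rdev-name>/<dev>-<supply>/uA_load
 *	.../regulator/<rdev-name>/<dev>-<supply>/min_uV
 *	.../regulator/<rdev-name>/<dev>-<supply>/max_uV
 *	.../regulator/<rdev-name>/<dev>-<supply>/constraint_flags
 */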
static struct regulator *create_regulator(struct regulator_dev *rdev, struct device *dev, const char *supply_name) { struct regulator *regulator; lockdep_assert_held_once(&rdev->mutex.base); if (dev) { char buf[REG_STR_SIZE]; int size; size = snprintf(buf, REG_STR_SIZE, "%s-%s", dev->kobj.name, supply_name); if (size >= REG_STR_SIZE) return NULL; supply_name = kstrdup(buf, GFP_KERNEL); if (supply_name == NULL) return NULL; } else { supply_name = kstrdup_const(supply_name, GFP_KERNEL); if (supply_name == NULL) return NULL; } regulator = kzalloc(sizeof(*regulator), GFP_KERNEL); if (regulator == NULL) { kfree_const(supply_name); return NULL; } regulator->rdev = rdev; regulator->supply_name = supply_name; list_add(&regulator->list, &rdev->consumer_list); /* * Check now if the regulator is an always on regulator - if * it is then we don't need to do nearly so much work for * enable/disable calls. */ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS) && _regulator_is_enabled(rdev)) regulator->always_on = true; return regulator; } static int _regulator_get_enable_time(struct regulator_dev *rdev) { if (rdev->constraints && rdev->constraints->enable_time) return rdev->constraints->enable_time; if (rdev->desc->ops->enable_time) return rdev->desc->ops->enable_time(rdev); return rdev->desc->enable_time; } static struct regulator_supply_alias *regulator_find_supply_alias( struct device *dev, const char *supply) { struct regulator_supply_alias *map; list_for_each_entry(map, &regulator_supply_alias_list, list) if (map->src_dev == dev && strcmp(map->src_supply, supply) == 0) return map; return NULL; } static void regulator_supply_alias(struct device **dev, const char **supply) { struct regulator_supply_alias *map; map = regulator_find_supply_alias(*dev, *supply); if (map) { dev_dbg(*dev, "Mapping supply %s to %s,%s\n", *supply, map->alias_supply, dev_name(map->alias_dev)); *dev = map->alias_dev; *supply = map->alias_supply; } } static int regulator_match(struct device *dev, const void *data) { struct regulator_dev *r = dev_to_rdev(dev); return strcmp(rdev_get_name(r), data) == 0; } static struct regulator_dev *regulator_lookup_by_name(const char *name) { struct device *dev; dev = class_find_device(&regulator_class, NULL, name, regulator_match); return dev ? dev_to_rdev(dev) : NULL; } static struct regulator_dev *regulator_dt_lookup(struct device *dev, const char *supply) { struct regulator_dev *r = NULL; if (dev_of_node(dev)) { r = of_regulator_dev_lookup(dev, dev_of_node(dev), supply); if (PTR_ERR(r) == -ENODEV) r = NULL; } return r; }
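/*
 * regulator_dt_lookup() above follows the usual "<name>-supply" DT
 * binding: a consumer that calls regulator_get(dev, "vdd") is matched
 * against a node like the following (node and phandle names are
 * illustrative):
 *
 *	some-device@0 {
 *		vdd-supply = <&ldo1>;
 *	};
 */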
/** * regulator_dev_lookup - lookup a regulator device. * @dev: device for regulator "consumer". * @supply: Supply name or regulator ID. * * Return: pointer to &struct regulator_dev or ERR_PTR() encoded negative error number. * * If successful, returns a struct regulator_dev that corresponds to the name * @supply and with the embedded struct device refcount incremented by one. * The refcount must be dropped by calling put_device(). * On failure one of the following ERR_PTR() encoded values is returned: * -%ENODEV if lookup fails permanently, -%EPROBE_DEFER if lookup could succeed * in the future. */ static struct regulator_dev *regulator_dev_lookup(struct device *dev, const char *supply) { struct regulator_dev *r = NULL; struct regulator_map *map; const char *devname = NULL; regulator_supply_alias(&dev, &supply); /* first do a dt based lookup */ r = regulator_dt_lookup(dev, supply); if (r) return r; /* if not found, try doing it the non-DT way */ if (dev) devname = dev_name(dev); mutex_lock(&regulator_list_mutex); list_for_each_entry(map, &regulator_map_list, list) { /* If the mapping has a device set up it must match */ if (map->dev_name && (!devname || strcmp(map->dev_name, devname))) continue; if (strcmp(map->supply, supply) == 0 && get_device(&map->regulator->dev)) { r = map->regulator; break; } } mutex_unlock(&regulator_list_mutex); if (r) return r; r = regulator_lookup_by_name(supply); if (r) return r; return ERR_PTR(-ENODEV); } static int regulator_resolve_supply(struct regulator_dev *rdev) { struct regulator_dev *r; struct device *dev = rdev->dev.parent; struct ww_acquire_ctx ww_ctx; int ret = 0; /* No supply to resolve? */ if (!rdev->supply_name) return 0; /* Supply already resolved? (fast-path without locking contention) */ if (rdev->supply) return 0; /* first do a dt based lookup on the node described in the virtual * device. */ r = regulator_dt_lookup(&rdev->dev, rdev->supply_name); /* If the regulator was not found, use the usual search path in the parent * device. */ if (!r) r = regulator_dev_lookup(dev, rdev->supply_name); if (IS_ERR(r)) { ret = PTR_ERR(r); /* Did the lookup explicitly defer for us? */ if (ret == -EPROBE_DEFER) goto out; if (have_full_constraints()) { r = dummy_regulator_rdev; if (!r) { ret = -EPROBE_DEFER; goto out; } get_device(&r->dev); } else { dev_err(dev, "Failed to resolve %s-supply for %s\n", rdev->supply_name, rdev->desc->name); ret = -EPROBE_DEFER; goto out; } } if (r == rdev) { dev_err(dev, "Supply for %s (%s) resolved to itself\n", rdev->desc->name, rdev->supply_name); if (!have_full_constraints()) { ret = -EINVAL; goto out; } r = dummy_regulator_rdev; if (!r) { ret = -EPROBE_DEFER; goto out; } get_device(&r->dev); } /* * If the supply's parent device is not the same as the * regulator's parent device, then ensure the parent device * is bound before we resolve the supply, in case the parent * device gets probe deferred and unregisters the supply. */ if (r->dev.parent && r->dev.parent != rdev->dev.parent) { if (!device_is_bound(r->dev.parent)) { put_device(&r->dev); ret = -EPROBE_DEFER; goto out; } } /* Recursively resolve the supply of the supply */ ret = regulator_resolve_supply(r); if (ret < 0) { put_device(&r->dev); goto out; } /* * Recheck rdev->supply with rdev->mutex lock held to avoid a race * between rdev->supply null check and setting rdev->supply in * set_supply() from concurrent tasks. */ regulator_lock_two(rdev, r, &ww_ctx); /* Supply just resolved by a concurrent task? */ if (rdev->supply) { regulator_unlock_two(rdev, r, &ww_ctx); put_device(&r->dev); goto out; } ret = set_supply(rdev, r); if (ret < 0) { regulator_unlock_two(rdev, r, &ww_ctx); put_device(&r->dev); goto out; } regulator_unlock_two(rdev, r, &ww_ctx); /* rdev->supply was created in set_supply() */ link_and_create_debugfs(rdev->supply, r, &rdev->dev); /* * In set_machine_constraints() we may have turned this regulator on * but we couldn't propagate to the supply if it hadn't been resolved * yet. Do it now. */ if (rdev->use_count) { ret = regulator_enable(rdev->supply); if (ret < 0) { _regulator_put(rdev->supply); rdev->supply = NULL; goto out; } } out: return ret; }
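/*
 * Resolution is allowed to fail transiently.  A typical sequence when a
 * consumer probes before the PMIC driver providing its supply (timeline
 * and names illustrative):
 *
 *	consumer probe:  regulator_get() -> -EPROBE_DEFER
 *	PMIC probe:      registers "buck2"
 *	consumer reprobe: regulator_get() succeeds and
 *	                  rdev->supply now points at buck2
 */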
/* common pre-checks for regulator requests */ int _regulator_get_common_check(struct device *dev, const char *id, enum regulator_get_type get_type) { if (get_type >= MAX_GET_TYPE) { dev_err(dev, "invalid type %d in %s\n", get_type, __func__); return -EINVAL; } if (id == NULL) { dev_err(dev, "regulator request with no identifier\n"); return -EINVAL; } return 0; } /** * _regulator_get_common - Common code for regulator requests * @rdev: regulator device pointer as returned by *regulator_dev_lookup() * Its reference count is expected to have been incremented. * @dev: device used for dev_printk messages * @id: Supply name or regulator ID * @get_type: enum regulator_get_type value corresponding to type of request * * Return: pointer to struct regulator corresponding to @rdev, or ERR_PTR() * encoded error. * * This function should be chained with *regulator_dev_lookup() functions. */ struct regulator *_regulator_get_common(struct regulator_dev *rdev, struct device *dev, const char *id, enum regulator_get_type get_type) { struct regulator *regulator; struct device_link *link; int ret; if (IS_ERR(rdev)) { ret = PTR_ERR(rdev); /* * If regulator_dev_lookup() fails with error other * than -ENODEV our job here is done, we simply return it. */ if (ret != -ENODEV) return ERR_PTR(ret); if (!have_full_constraints()) { dev_warn(dev, "incomplete constraints, dummy supplies not allowed (id=%s)\n", id); return ERR_PTR(-ENODEV); } switch (get_type) { case NORMAL_GET: /* * Assume that a regulator is physically present and * enabled, even if it isn't hooked up, and just * provide a dummy. */ rdev = dummy_regulator_rdev; if (!rdev) return ERR_PTR(-EPROBE_DEFER); dev_warn(dev, "supply %s not found, using dummy regulator\n", id); get_device(&rdev->dev); break; case EXCLUSIVE_GET: dev_warn(dev, "dummy supplies not allowed for exclusive requests (id=%s)\n", id); fallthrough; default: return ERR_PTR(-ENODEV); } } if (rdev->exclusive) { regulator = ERR_PTR(-EPERM); put_device(&rdev->dev); return regulator; } if (get_type == EXCLUSIVE_GET && rdev->open_count) { regulator = ERR_PTR(-EBUSY); put_device(&rdev->dev); return regulator; } mutex_lock(&regulator_list_mutex); ret = (rdev->coupling_desc.n_resolved != rdev->coupling_desc.n_coupled); mutex_unlock(&regulator_list_mutex); if (ret != 0) { regulator = ERR_PTR(-EPROBE_DEFER); put_device(&rdev->dev); return regulator; } ret = regulator_resolve_supply(rdev); if (ret < 0) { regulator = ERR_PTR(ret); put_device(&rdev->dev); return regulator; } if (!try_module_get(rdev->owner)) { regulator = ERR_PTR(-EPROBE_DEFER); put_device(&rdev->dev); return regulator; } regulator_lock(rdev); regulator = create_regulator(rdev, dev, id); regulator_unlock(rdev); if (regulator == NULL) { regulator = ERR_PTR(-ENOMEM); module_put(rdev->owner); put_device(&rdev->dev); return regulator; } link_and_create_debugfs(regulator, rdev, dev); rdev->open_count++; if (get_type == EXCLUSIVE_GET) { rdev->exclusive = 1; ret = _regulator_is_enabled(rdev); if (ret > 0) { rdev->use_count = 1; regulator->enable_count = 1; /* Propagate the regulator state to its supply */ if (rdev->supply) { ret = regulator_enable(rdev->supply); if (ret < 0) { destroy_regulator(regulator); module_put(rdev->owner); put_device(&rdev->dev); return ERR_PTR(ret); } } } else { rdev->use_count = 0; regulator->enable_count = 0; } } link = device_link_add(dev, &rdev->dev, 
DL_FLAG_STATELESS); if (!IS_ERR_OR_NULL(link)) regulator->device_link = true; return regulator; } /* Internal regulator request function */ struct regulator *_regulator_get(struct device *dev, const char *id, enum regulator_get_type get_type) { struct regulator_dev *rdev; int ret; ret = _regulator_get_common_check(dev, id, get_type); if (ret) return ERR_PTR(ret); rdev = regulator_dev_lookup(dev, id); return _regulator_get_common(rdev, dev, id, get_type); } /** * regulator_get - lookup and obtain a reference to a regulator. * @dev: device for regulator "consumer" * @id: Supply name or regulator ID. * * Use of supply names configured via set_consumer_device_supply() is * strongly encouraged. It is recommended that the supply name used * should match the name used for the supply and/or the relevant * device pins in the datasheet. * * Return: Pointer to a &struct regulator corresponding to the regulator * producer, or an ERR_PTR() encoded negative error number. */ struct regulator *regulator_get(struct device *dev, const char *id) { return _regulator_get(dev, id, NORMAL_GET); } EXPORT_SYMBOL_GPL(regulator_get); /** * regulator_get_exclusive - obtain exclusive access to a regulator. * @dev: device for regulator "consumer" * @id: Supply name or regulator ID. * * Other consumers will be unable to obtain this regulator while this * reference is held and the use count for the regulator will be * initialised to reflect the current state of the regulator. * * This is intended for use by consumers which cannot tolerate shared * use of the regulator such as those which need to force the * regulator off for correct operation of the hardware they are * controlling. * * Use of supply names configured via set_consumer_device_supply() is * strongly encouraged. It is recommended that the supply name used * should match the name used for the supply and/or the relevant * device pins in the datasheet. * * Return: Pointer to a &struct regulator corresponding to the regulator * producer, or an ERR_PTR() encoded negative error number. */ struct regulator *regulator_get_exclusive(struct device *dev, const char *id) { return _regulator_get(dev, id, EXCLUSIVE_GET); } EXPORT_SYMBOL_GPL(regulator_get_exclusive); /** * regulator_get_optional - obtain optional access to a regulator. * @dev: device for regulator "consumer" * @id: Supply name or regulator ID. * * This is intended for use by consumers for devices which can have * some supplies unconnected in normal use, such as some MMC devices. * It can allow the regulator core to provide stub supplies for other * supplies requested using normal regulator_get() calls without * disrupting the operation of drivers that can handle absent * supplies. * * Use of supply names configured via set_consumer_device_supply() is * strongly encouraged. It is recommended that the supply name used * should match the name used for the supply and/or the relevant * device pins in the datasheet. * * Return: Pointer to a &struct regulator corresponding to the regulator * producer, or an ERR_PTR() encoded negative error number. 
*/ struct regulator *regulator_get_optional(struct device *dev, const char *id) { return _regulator_get(dev, id, OPTIONAL_GET); } EXPORT_SYMBOL_GPL(regulator_get_optional); static void destroy_regulator(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; debugfs_remove_recursive(regulator->debugfs); if (regulator->dev) { if (regulator->device_link) device_link_remove(regulator->dev, &rdev->dev); /* remove any sysfs entries */ sysfs_remove_link(&rdev->dev.kobj, regulator->supply_name); } regulator_lock(rdev); list_del(&regulator->list); rdev->open_count--; rdev->exclusive = 0; regulator_unlock(rdev); kfree_const(regulator->supply_name); kfree(regulator); } /* regulator_list_mutex lock held by regulator_put() */ static void _regulator_put(struct regulator *regulator) { struct regulator_dev *rdev; if (IS_ERR_OR_NULL(regulator)) return; lockdep_assert_held_once(&regulator_list_mutex); /* Docs say you must disable before calling regulator_put() */ WARN_ON(regulator->enable_count); rdev = regulator->rdev; destroy_regulator(regulator); module_put(rdev->owner); put_device(&rdev->dev); } /** * regulator_put - "free" the regulator source * @regulator: regulator source * * Note: drivers must ensure that all regulator_enable calls made on this * regulator source are balanced by regulator_disable calls prior to calling * this function. */ void regulator_put(struct regulator *regulator) { mutex_lock(&regulator_list_mutex); _regulator_put(regulator); mutex_unlock(&regulator_list_mutex); } EXPORT_SYMBOL_GPL(regulator_put); /** * regulator_register_supply_alias - Provide device alias for supply lookup * * @dev: device that will be given as the regulator "consumer" * @id: Supply name or regulator ID * @alias_dev: device that should be used to lookup the supply * @alias_id: Supply name or regulator ID that should be used to lookup the * supply * * All lookups for id on dev will instead be conducted for alias_id on * alias_dev. * * Return: 0 on success or a negative error number on failure. */ int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id) { struct regulator_supply_alias *map; map = regulator_find_supply_alias(dev, id); if (map) return -EEXIST; map = kzalloc(sizeof(struct regulator_supply_alias), GFP_KERNEL); if (!map) return -ENOMEM; map->src_dev = dev; map->src_supply = id; map->alias_dev = alias_dev; map->alias_supply = alias_id; list_add(&map->list, &regulator_supply_alias_list); pr_info("Adding alias for supply %s,%s -> %s,%s\n", id, dev_name(dev), alias_id, dev_name(alias_dev)); return 0; } EXPORT_SYMBOL_GPL(regulator_register_supply_alias);
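/*
 * Usage sketch for the alias registration above (device and supply
 * names invented for illustration): an MFD core that wants a child's
 * "vdd" lookups redirected to its own "vcc" supply might do:
 *
 *	ret = regulator_register_supply_alias(&child->dev, "vdd",
 *					      parent, "vcc");
 *
 * after which regulator_get(&child->dev, "vdd") behaves like
 * regulator_get(parent, "vcc").
 */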
/** * regulator_unregister_supply_alias - Remove device alias * * @dev: device that will be given as the regulator "consumer" * @id: Supply name or regulator ID * * Remove a lookup alias if one exists for id on dev. */ void regulator_unregister_supply_alias(struct device *dev, const char *id) { struct regulator_supply_alias *map; map = regulator_find_supply_alias(dev, id); if (map) { list_del(&map->list); kfree(map); } } EXPORT_SYMBOL_GPL(regulator_unregister_supply_alias); /** * regulator_bulk_register_supply_alias - register multiple aliases * * @dev: device that will be given as the regulator "consumer" * @id: List of supply names or regulator IDs * @alias_dev: device that should be used to lookup the supply * @alias_id: List of supply names or regulator IDs that should be used to * lookup the supply * @num_id: Number of aliases to register * * This helper function allows drivers to register several supply * aliases in one operation. If any of the aliases cannot be * registered any aliases that were registered will be removed * before returning to the caller. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_register_supply_alias(struct device *dev, const char *const *id, struct device *alias_dev, const char *const *alias_id, int num_id) { int i; int ret; for (i = 0; i < num_id; ++i) { ret = regulator_register_supply_alias(dev, id[i], alias_dev, alias_id[i]); if (ret < 0) goto err; } return 0; err: dev_err(dev, "Failed to create supply alias %s,%s -> %s,%s\n", id[i], dev_name(dev), alias_id[i], dev_name(alias_dev)); while (--i >= 0) regulator_unregister_supply_alias(dev, id[i]); return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_register_supply_alias); /** * regulator_bulk_unregister_supply_alias - unregister multiple aliases * * @dev: device that will be given as the regulator "consumer" * @id: List of supply names or regulator IDs * @num_id: Number of aliases to unregister * * This helper function allows drivers to unregister several supply * aliases in one operation. */ void regulator_bulk_unregister_supply_alias(struct device *dev, const char *const *id, int num_id) { int i; for (i = 0; i < num_id; ++i) regulator_unregister_supply_alias(dev, id[i]); } EXPORT_SYMBOL_GPL(regulator_bulk_unregister_supply_alias); /* Manage enable GPIO list. Same GPIO pin can be shared among regulators */ static int regulator_ena_gpio_request(struct regulator_dev *rdev, const struct regulator_config *config) { struct regulator_enable_gpio *pin, *new_pin; struct gpio_desc *gpiod; gpiod = config->ena_gpiod; new_pin = kzalloc(sizeof(*new_pin), GFP_KERNEL); mutex_lock(&regulator_list_mutex); list_for_each_entry(pin, &regulator_ena_gpio_list, list) { if (gpiod_is_equal(pin->gpiod, gpiod)) { rdev_dbg(rdev, "GPIO is already used\n"); goto update_ena_gpio_to_rdev; } } if (new_pin == NULL) { mutex_unlock(&regulator_list_mutex); return -ENOMEM; } pin = new_pin; new_pin = NULL; pin->gpiod = gpiod; list_add(&pin->list, &regulator_ena_gpio_list); update_ena_gpio_to_rdev: pin->request_count++; rdev->ena_pin = pin; mutex_unlock(&regulator_list_mutex); kfree(new_pin); return 0; } static void regulator_ena_gpio_free(struct regulator_dev *rdev) { struct regulator_enable_gpio *pin, *n; if (!rdev->ena_pin) return; /* Free the GPIO only in case of no use */ list_for_each_entry_safe(pin, n, &regulator_ena_gpio_list, list) { if (pin != rdev->ena_pin) continue; if (--pin->request_count) break; gpiod_put(pin->gpiod); list_del(&pin->list); kfree(pin); break; } rdev->ena_pin = NULL; }
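/*
 * Worked example of the shared enable-GPIO accounting implemented by
 * regulator_ena_gpio_ctrl() below: two regulators A and B wired to one
 * enable line (counts shown after each step):
 *
 *	A enables:  pin->enable_count 0 -> 1, GPIO driven high
 *	B enables:  pin->enable_count 1 -> 2, GPIO left high
 *	A disables: pin->enable_count 2 -> 1, GPIO left high
 *	B disables: pin->enable_count 1 -> 0, GPIO driven low
 */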
/** * regulator_ena_gpio_ctrl - balance enable_count of each GPIO and actual GPIO pin control * @rdev: regulator_dev structure * @enable: enable GPIO at initial use? * * The GPIO is enabled on initial use (enable_count is 0) and disabled * once it is no longer shared (enable_count <= 1). * * Return: 0 on success or a negative error number on failure. */ static int regulator_ena_gpio_ctrl(struct regulator_dev *rdev, bool enable) { struct regulator_enable_gpio *pin = rdev->ena_pin; if (!pin) return -EINVAL; if (enable) { /* Enable GPIO at initial use */ if (pin->enable_count == 0) gpiod_set_value_cansleep(pin->gpiod, 1); pin->enable_count++; } else { if (pin->enable_count > 1) { pin->enable_count--; return 0; } /* Disable GPIO if not used */ if (pin->enable_count <= 1) { gpiod_set_value_cansleep(pin->gpiod, 0); pin->enable_count = 0; } } return 0; } /** * _regulator_check_status_enabled - check if regulator status can be * interpreted as "regulator is enabled" * @rdev: the regulator device to check * * Return: * * 1 - if status shows regulator is in enabled state * * 0 - if not enabled state * * Error Value - as received from ops->get_status() */ static inline int _regulator_check_status_enabled(struct regulator_dev *rdev) { int ret = rdev->desc->ops->get_status(rdev); if (ret < 0) { rdev_info(rdev, "get_status returned error: %d\n", ret); return ret; } switch (ret) { case REGULATOR_STATUS_OFF: case REGULATOR_STATUS_ERROR: case REGULATOR_STATUS_UNDEFINED: return 0; default: return 1; } } static int _regulator_do_enable(struct regulator_dev *rdev) { int ret, delay; /* Query before enabling in case configuration dependent. */ ret = _regulator_get_enable_time(rdev); if (ret >= 0) { delay = ret; } else { rdev_warn(rdev, "enable_time() failed: %pe\n", ERR_PTR(ret)); delay = 0; } trace_regulator_enable(rdev_get_name(rdev)); if (rdev->desc->off_on_delay) { /* if needed, keep a distance of off_on_delay from last time * this regulator was disabled. */ ktime_t end = ktime_add_us(rdev->last_off, rdev->desc->off_on_delay); s64 remaining = ktime_us_delta(end, ktime_get_boottime()); if (remaining > 0) fsleep(remaining); } if (rdev->ena_pin) { if (!rdev->ena_gpio_state) { ret = regulator_ena_gpio_ctrl(rdev, true); if (ret < 0) return ret; rdev->ena_gpio_state = 1; } } else if (rdev->desc->ops->enable) { ret = rdev->desc->ops->enable(rdev); if (ret < 0) return ret; } else { return -EINVAL; } /* Allow the regulator to ramp; it would be useful to extend * this for bulk operations so that the regulators can ramp * together. */ trace_regulator_enable_delay(rdev_get_name(rdev)); /* If poll_enabled_time is set, poll up to the delay calculated * above, delaying poll_enabled_time us between checks of whether * the regulator actually got enabled. * If the regulator isn't enabled after our delay helper has expired, * return -ETIMEDOUT. */ if (rdev->desc->poll_enabled_time) { int time_remaining = delay; while (time_remaining > 0) { fsleep(rdev->desc->poll_enabled_time); if (rdev->desc->ops->get_status) { ret = _regulator_check_status_enabled(rdev); if (ret < 0) return ret; else if (ret) break; } else if (rdev->desc->ops->is_enabled(rdev)) break; time_remaining -= rdev->desc->poll_enabled_time; } if (time_remaining <= 0) { rdev_err(rdev, "Enabled check timed out\n"); return -ETIMEDOUT; } } else { fsleep(delay); } trace_regulator_enable_complete(rdev_get_name(rdev)); return 0; }
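/*
 * Numeric sketch of the poll_enabled_time loop in _regulator_do_enable()
 * above (values illustrative): with an enable delay of 3000us and
 * poll_enabled_time of 1000us, the status is re-checked at roughly
 * t = 1ms, 2ms and 3ms; if the regulator still reports off after the
 * last check, the enable fails with -ETIMEDOUT.
 */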
This is illustrated by an example with two consumers of the same * regulator: * consumer A: set_load(100); => total load = 0 * consumer A: regulator_enable(); => total load = 100 * consumer B: set_load(1000); => total load = 100 * consumer B: regulator_enable(); => total load = 1100 * consumer A: regulator_disable(); => total_load = 1000 * * This function (together with _regulator_handle_consumer_disable) is * responsible for keeping track of the refcount for a given regulator consumer * and applying / unapplying these things. * * Return: 0 on success or a negative error number on failure. */ static int _regulator_handle_consumer_enable(struct regulator *regulator) { int ret; struct regulator_dev *rdev = regulator->rdev; lockdep_assert_held_once(&rdev->mutex.base); regulator->enable_count++; if (regulator->uA_load && regulator->enable_count == 1) { ret = drms_uA_update(rdev); if (ret) regulator->enable_count--; return ret; } return 0; } /** * _regulator_handle_consumer_disable - handle that a consumer disabled * @regulator: regulator source * * The opposite of _regulator_handle_consumer_enable(). * * Return: 0 on success or a negative error number on failure. */ static int _regulator_handle_consumer_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; lockdep_assert_held_once(&rdev->mutex.base); if (!regulator->enable_count) { rdev_err(rdev, "Underflow of regulator enable count\n"); return -EINVAL; } regulator->enable_count--; if (regulator->uA_load && regulator->enable_count == 0) return drms_uA_update(rdev); return 0; } /* locks held by regulator_enable() */ static int _regulator_enable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; int ret; lockdep_assert_held_once(&rdev->mutex.base); if (rdev->use_count == 0 && rdev->supply) { ret = _regulator_enable(rdev->supply); if (ret < 0) return ret; } /* balance only if there are regulators coupled */ if (rdev->coupling_desc.n_coupled > 1) { ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); if (ret < 0) goto err_disable_supply; } ret = _regulator_handle_consumer_enable(regulator); if (ret < 0) goto err_disable_supply; if (rdev->use_count == 0) { /* * The regulator may already be enabled if it's not switchable * or was left on */ ret = _regulator_is_enabled(rdev); if (ret == -EINVAL || ret == 0) { if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) { ret = -EPERM; goto err_consumer_disable; } ret = _regulator_do_enable(rdev); if (ret < 0) goto err_consumer_disable; _notifier_call_chain(rdev, REGULATOR_EVENT_ENABLE, NULL); } else if (ret < 0) { rdev_err(rdev, "is_enabled() failed: %pe\n", ERR_PTR(ret)); goto err_consumer_disable; } /* Fallthrough on positive return values - already enabled */ } if (regulator->enable_count == 1) rdev->use_count++; return 0; err_consumer_disable: _regulator_handle_consumer_disable(regulator); err_disable_supply: if (rdev->use_count == 0 && rdev->supply) _regulator_disable(rdev->supply); return ret; } /** * regulator_enable - enable regulator output * @regulator: regulator source * * Request that the regulator be enabled with the regulator output at * the predefined voltage or current value. Calls to regulator_enable() * must be balanced with calls to regulator_disable(). * * NOTE: the output value can be set by other drivers, boot loader or may be * hardwired in the regulator. * * Return: 0 on success or a negative error number on failure.
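* * A minimal sketch of balanced usage by a hypothetical consumer driver (the "vcc" supply name and the surrounding variables are illustrative, not part of this file): * *	struct regulator *vcc; *	int ret; * *	vcc = devm_regulator_get(dev, "vcc"); *	if (IS_ERR(vcc)) *		return PTR_ERR(vcc); *	ret = regulator_enable(vcc); *	if (ret) *		return ret; *	... use the hardware ... *	regulator_disable(vcc);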
*/ int regulator_enable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(rdev, &ww_ctx); ret = _regulator_enable(regulator); regulator_unlock_dependent(rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_enable); static int _regulator_do_disable(struct regulator_dev *rdev) { int ret; trace_regulator_disable(rdev_get_name(rdev)); if (rdev->ena_pin) { if (rdev->ena_gpio_state) { ret = regulator_ena_gpio_ctrl(rdev, false); if (ret < 0) return ret; rdev->ena_gpio_state = 0; } } else if (rdev->desc->ops->disable) { ret = rdev->desc->ops->disable(rdev); if (ret != 0) return ret; } if (rdev->desc->off_on_delay) rdev->last_off = ktime_get_boottime(); trace_regulator_disable_complete(rdev_get_name(rdev)); return 0; } /* locks held by regulator_disable() */ static int _regulator_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; int ret = 0; lockdep_assert_held_once(&rdev->mutex.base); if (WARN(regulator->enable_count == 0, "unbalanced disables for %s\n", rdev_get_name(rdev))) return -EIO; if (regulator->enable_count == 1) { /* disabling last enable_count from this regulator */ /* are we the last user and permitted to disable ? */ if (rdev->use_count == 1 && (rdev->constraints && !rdev->constraints->always_on)) { /* we are last user */ if (regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) { ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_DISABLE, NULL); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = _regulator_do_disable(rdev); if (ret < 0) { rdev_err(rdev, "failed to disable: %pe\n", ERR_PTR(ret)); _notifier_call_chain(rdev, REGULATOR_EVENT_ABORT_DISABLE, NULL); return ret; } _notifier_call_chain(rdev, REGULATOR_EVENT_DISABLE, NULL); } rdev->use_count = 0; } else if (rdev->use_count > 1) { rdev->use_count--; } } if (ret == 0) ret = _regulator_handle_consumer_disable(regulator); if (ret == 0 && rdev->coupling_desc.n_coupled > 1) ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); if (ret == 0 && rdev->use_count == 0 && rdev->supply) ret = _regulator_disable(rdev->supply); return ret; } /** * regulator_disable - disable regulator output * @regulator: regulator source * * Disable the regulator output voltage or current. Calls to * regulator_enable() must be balanced with calls to * regulator_disable(). * * NOTE: this will only disable the regulator output if no other consumer * devices have it enabled, the regulator device supports disabling and * machine constraints permit this operation. * * Return: 0 on success or a negative error number on failure. 
*/ int regulator_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(rdev, &ww_ctx); ret = _regulator_disable(regulator); regulator_unlock_dependent(rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_disable); /* locks held by regulator_force_disable() */ static int _regulator_force_disable(struct regulator_dev *rdev) { int ret = 0; lockdep_assert_held_once(&rdev->mutex.base); ret = _notifier_call_chain(rdev, REGULATOR_EVENT_FORCE_DISABLE | REGULATOR_EVENT_PRE_DISABLE, NULL); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = _regulator_do_disable(rdev); if (ret < 0) { rdev_err(rdev, "failed to force disable: %pe\n", ERR_PTR(ret)); _notifier_call_chain(rdev, REGULATOR_EVENT_FORCE_DISABLE | REGULATOR_EVENT_ABORT_DISABLE, NULL); return ret; } _notifier_call_chain(rdev, REGULATOR_EVENT_FORCE_DISABLE | REGULATOR_EVENT_DISABLE, NULL); return 0; } /** * regulator_force_disable - force disable regulator output * @regulator: regulator source * * Forcibly disable the regulator output voltage or current. * NOTE: this *will* disable the regulator output even if other consumer * devices have it enabled. This should be used for situations when device * damage will likely occur if the regulator is not disabled (e.g. over temp). * * Return: 0 on success or a negative error number on failure. */ int regulator_force_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(rdev, &ww_ctx); ret = _regulator_force_disable(regulator->rdev); if (rdev->coupling_desc.n_coupled > 1) regulator_balance_voltage(rdev, PM_SUSPEND_ON); if (regulator->uA_load) { regulator->uA_load = 0; ret = drms_uA_update(rdev); } if (rdev->use_count != 0 && rdev->supply) _regulator_disable(rdev->supply); regulator_unlock_dependent(rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_force_disable); static void regulator_disable_work(struct work_struct *work) { struct regulator_dev *rdev = container_of(work, struct regulator_dev, disable_work.work); struct ww_acquire_ctx ww_ctx; int count, i, ret; struct regulator *regulator; int total_count = 0; regulator_lock_dependent(rdev, &ww_ctx); /* * Workqueue functions queue the new work instance while the previous * work instance is being processed. Cancel the queued work instance * as the work instance under processing does the job of the queued * work instance. */ cancel_delayed_work(&rdev->disable_work); list_for_each_entry(regulator, &rdev->consumer_list, list) { count = regulator->deferred_disables; if (!count) continue; total_count += count; regulator->deferred_disables = 0; for (i = 0; i < count; i++) { ret = _regulator_disable(regulator); if (ret != 0) rdev_err(rdev, "Deferred disable failed: %pe\n", ERR_PTR(ret)); } } WARN_ON(!total_count); if (rdev->coupling_desc.n_coupled > 1) regulator_balance_voltage(rdev, PM_SUSPEND_ON); regulator_unlock_dependent(rdev, &ww_ctx); } /** * regulator_disable_deferred - disable regulator output with delay * @regulator: regulator source * @ms: milliseconds until the regulator is disabled * * Execute regulator_disable() on the regulator after a delay. This * is intended for use with devices that require some time to quiesce. * * NOTE: this will only disable the regulator output if no other consumer * devices have it enabled, the regulator device supports disabling and * machine constraints permit this operation. 
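* * A sketch of hypothetical consumer usage, assuming a device that needs its supply held up for roughly 50 milliseconds while it quiesces (the figure and names are illustrative): * *	my_device_stop(priv); *	ret = regulator_disable_deferred(priv->vcc, 50); *	if (ret) *		dev_warn(dev, "deferred disable failed: %d\n", ret);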
* * Return: 0 on success or a negative error number on failure. */ int regulator_disable_deferred(struct regulator *regulator, int ms) { struct regulator_dev *rdev = regulator->rdev; if (!ms) return regulator_disable(regulator); regulator_lock(rdev); regulator->deferred_disables++; mod_delayed_work(system_power_efficient_wq, &rdev->disable_work, msecs_to_jiffies(ms)); regulator_unlock(rdev); return 0; } EXPORT_SYMBOL_GPL(regulator_disable_deferred); static int _regulator_is_enabled(struct regulator_dev *rdev) { /* A GPIO control always takes precedence */ if (rdev->ena_pin) return rdev->ena_gpio_state; /* If we don't know then assume that the regulator is always on */ if (!rdev->desc->ops->is_enabled) return 1; return rdev->desc->ops->is_enabled(rdev); } static int _regulator_list_voltage(struct regulator_dev *rdev, unsigned selector, int lock) { const struct regulator_ops *ops = rdev->desc->ops; int ret; if (rdev->desc->fixed_uV && rdev->desc->n_voltages == 1 && !selector) return rdev->desc->fixed_uV; if (ops->list_voltage) { if (selector >= rdev->desc->n_voltages) return -EINVAL; if (selector < rdev->desc->linear_min_sel) return 0; if (lock) regulator_lock(rdev); ret = ops->list_voltage(rdev, selector); if (lock) regulator_unlock(rdev); } else if (rdev->is_switch && rdev->supply) { ret = _regulator_list_voltage(rdev->supply->rdev, selector, lock); } else { return -EINVAL; } if (ret > 0) { if (ret < rdev->constraints->min_uV) ret = 0; else if (ret > rdev->constraints->max_uV) ret = 0; } return ret; } /** * regulator_is_enabled - is the regulator output enabled * @regulator: regulator source * * Note that the device backing this regulator handle can have multiple * users, so it might be enabled even if regulator_enable() was never * called for this particular source. * * Return: Positive if the regulator driver backing the source/client * has requested that the device be enabled, zero if it hasn't, * else a negative error number. */ int regulator_is_enabled(struct regulator *regulator) { int ret; if (regulator->always_on) return 1; regulator_lock(regulator->rdev); ret = _regulator_is_enabled(regulator->rdev); regulator_unlock(regulator->rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_is_enabled); /** * regulator_count_voltages - count regulator_list_voltage() selectors * @regulator: regulator source * * Return: Number of selectors for @regulator, or negative error number. * * Selectors are numbered starting at zero, and typically correspond to * bitfields in hardware registers. */ int regulator_count_voltages(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; if (rdev->desc->n_voltages) return rdev->desc->n_voltages; if (!rdev->is_switch || !rdev->supply) return -EINVAL; return regulator_count_voltages(rdev->supply); } EXPORT_SYMBOL_GPL(regulator_count_voltages); /** * regulator_list_voltage - enumerate supported voltages * @regulator: regulator source * @selector: identify voltage to list * Context: can sleep * * Return: Voltage for @selector that can be passed to regulator_set_voltage(), * 0 if @selector can't be used on this system, or a negative error * number on failure. 
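* * For example, a hypothetical consumer can enumerate every selectable voltage like this (illustrative sketch only): * *	int i, n = regulator_count_voltages(reg); * *	for (i = 0; i < n; i++) { *		int uV = regulator_list_voltage(reg, i); * *		if (uV > 0) *			dev_dbg(dev, "selector %d -> %d uV\n", i, uV); *	}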
*/ int regulator_list_voltage(struct regulator *regulator, unsigned selector) { return _regulator_list_voltage(regulator->rdev, selector, 1); } EXPORT_SYMBOL_GPL(regulator_list_voltage); /** * regulator_get_regmap - get the regulator's register map * @regulator: regulator source * * Return: Pointer to the &struct regmap for @regulator, or ERR_PTR() * encoded -%EOPNOTSUPP if @regulator doesn't use regmap. */ struct regmap *regulator_get_regmap(struct regulator *regulator) { struct regmap *map = regulator->rdev->regmap; return map ? map : ERR_PTR(-EOPNOTSUPP); } EXPORT_SYMBOL_GPL(regulator_get_regmap); /** * regulator_get_hardware_vsel_register - get the HW voltage selector register * @regulator: regulator source * @vsel_reg: voltage selector register, output parameter * @vsel_mask: mask for voltage selector bitfield, output parameter * * Returns the hardware register offset and bitmask used for setting the * regulator voltage. This might be useful when configuring voltage-scaling * hardware or firmware that can make I2C requests behind the kernel's back, * for example. * * On success, the output parameters @vsel_reg and @vsel_mask are filled in. * * Return: 0 on success, or -%EOPNOTSUPP if the regulator does not support * voltage selectors. */ int regulator_get_hardware_vsel_register(struct regulator *regulator, unsigned *vsel_reg, unsigned *vsel_mask) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; if (ops->set_voltage_sel != regulator_set_voltage_sel_regmap) return -EOPNOTSUPP; *vsel_reg = rdev->desc->vsel_reg; *vsel_mask = rdev->desc->vsel_mask; return 0; } EXPORT_SYMBOL_GPL(regulator_get_hardware_vsel_register); /** * regulator_list_hardware_vsel - get the HW-specific register value for a selector * @regulator: regulator source * @selector: identify voltage to list * * Converts the selector to a hardware-specific voltage selector that can be * directly written to the regulator registers. The address of the voltage * register can be determined by calling regulator_get_hardware_vsel_register(). * * Return: The hardware-specific value for @selector on success, -%EINVAL if * the selector is outside the supported range, or -%EOPNOTSUPP if the * regulator does not support voltage selectors. */ int regulator_list_hardware_vsel(struct regulator *regulator, unsigned selector) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; if (selector >= rdev->desc->n_voltages) return -EINVAL; if (selector < rdev->desc->linear_min_sel) return 0; if (ops->set_voltage_sel != regulator_set_voltage_sel_regmap) return -EOPNOTSUPP; return selector; } EXPORT_SYMBOL_GPL(regulator_list_hardware_vsel); /** * regulator_hardware_enable - access the HW for enable/disable regulator * @regulator: regulator source * @enable: true for enable, false for disable * * Request that the regulator be enabled/disabled with the regulator output at * the predefined voltage or current value. * * Return: 0 on success or a negative error number on failure.
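* * This bypasses the core's usage counting and so is only available to exclusive consumers; a hedged sketch of intended use ("vdd-core" is an illustrative supply name): * *	reg = regulator_get_exclusive(dev, "vdd-core"); *	if (IS_ERR(reg)) *		return PTR_ERR(reg); *	ret = regulator_hardware_enable(reg, true);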
*/ int regulator_hardware_enable(struct regulator *regulator, bool enable) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; int ret = -EOPNOTSUPP; if (!rdev->exclusive || !ops || !ops->enable || !ops->disable) return ret; if (enable) ret = ops->enable(rdev); else ret = ops->disable(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_hardware_enable); /** * regulator_get_linear_step - return the voltage step size between VSEL values * @regulator: regulator source * * Return: The voltage step size between VSEL values for linear regulators, * or 0 if the regulator isn't a linear regulator. */ unsigned int regulator_get_linear_step(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; return rdev->desc->uV_step; } EXPORT_SYMBOL_GPL(regulator_get_linear_step); /** * regulator_is_supported_voltage - check if a voltage range can be supported * * @regulator: Regulator to check. * @min_uV: Minimum required voltage in uV. * @max_uV: Maximum required voltage in uV. * * Return: 1 if the voltage range is supported, 0 if not, or a negative error * number if @regulator's voltage can't be changed and voltage readback * failed. */ int regulator_is_supported_voltage(struct regulator *regulator, int min_uV, int max_uV) { struct regulator_dev *rdev = regulator->rdev; int i, voltages, ret; /* If we can't change voltage check the current voltage */ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) { ret = regulator_get_voltage(regulator); if (ret >= 0) return min_uV <= ret && ret <= max_uV; else return ret; } /* Any voltage within the constraints range is fine */ if (rdev->desc->continuous_voltage_range) return min_uV >= rdev->constraints->min_uV && max_uV <= rdev->constraints->max_uV; ret = regulator_count_voltages(regulator); if (ret < 0) return 0; voltages = ret; for (i = 0; i < voltages; i++) { ret = regulator_list_voltage(regulator, i); if (ret >= min_uV && ret <= max_uV) return 1; } return 0; } EXPORT_SYMBOL_GPL(regulator_is_supported_voltage); static int regulator_map_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) { const struct regulator_desc *desc = rdev->desc; if (desc->ops->map_voltage) return desc->ops->map_voltage(rdev, min_uV, max_uV); if (desc->ops->list_voltage == regulator_list_voltage_linear) return regulator_map_voltage_linear(rdev, min_uV, max_uV); if (desc->ops->list_voltage == regulator_list_voltage_linear_range) return regulator_map_voltage_linear_range(rdev, min_uV, max_uV); if (desc->ops->list_voltage == regulator_list_voltage_pickable_linear_range) return regulator_map_voltage_pickable_linear_range(rdev, min_uV, max_uV); return regulator_map_voltage_iterate(rdev, min_uV, max_uV); } static int _regulator_call_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV, unsigned *selector) { struct pre_voltage_change_data data; int ret; data.old_uV = regulator_get_voltage_rdev(rdev); data.min_uV = min_uV; data.max_uV = max_uV; ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_VOLTAGE_CHANGE, &data); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV, selector); if (ret >= 0) return ret; _notifier_call_chain(rdev, REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE, (void *)data.old_uV); return ret; } static int _regulator_call_set_voltage_sel(struct regulator_dev *rdev, int uV, unsigned selector) { struct pre_voltage_change_data data; int ret; data.old_uV = regulator_get_voltage_rdev(rdev); data.min_uV = uV; data.max_uV = uV; ret = _notifier_call_chain(rdev,
REGULATOR_EVENT_PRE_VOLTAGE_CHANGE, &data); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = rdev->desc->ops->set_voltage_sel(rdev, selector); if (ret >= 0) return ret; _notifier_call_chain(rdev, REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE, (void *)data.old_uV); return ret; } static int _regulator_set_voltage_sel_step(struct regulator_dev *rdev, int uV, int new_selector) { const struct regulator_ops *ops = rdev->desc->ops; int diff, old_sel, curr_sel, ret; /* Stepping is only needed if the regulator is enabled. */ if (!_regulator_is_enabled(rdev)) goto final_set; if (!ops->get_voltage_sel) return -EINVAL; old_sel = ops->get_voltage_sel(rdev); if (old_sel < 0) return old_sel; diff = new_selector - old_sel; if (diff == 0) return 0; /* No change needed. */ if (diff > 0) { /* Stepping up. */ for (curr_sel = old_sel + rdev->desc->vsel_step; curr_sel < new_selector; curr_sel += rdev->desc->vsel_step) { /* * Call the callback directly instead of using * _regulator_call_set_voltage_sel() as we don't * want to notify anyone yet. Same in the branch * below. */ ret = ops->set_voltage_sel(rdev, curr_sel); if (ret) goto try_revert; } } else { /* Stepping down. */ for (curr_sel = old_sel - rdev->desc->vsel_step; curr_sel > new_selector; curr_sel -= rdev->desc->vsel_step) { ret = ops->set_voltage_sel(rdev, curr_sel); if (ret) goto try_revert; } } final_set: /* The final selector will trigger the notifiers. */ return _regulator_call_set_voltage_sel(rdev, uV, new_selector); try_revert: /* * At least try to return to the previous voltage if setting a new * one failed. */ (void)ops->set_voltage_sel(rdev, old_sel); return ret; } static int _regulator_set_voltage_time(struct regulator_dev *rdev, int old_uV, int new_uV) { unsigned int ramp_delay = 0; if (rdev->constraints->ramp_delay) ramp_delay = rdev->constraints->ramp_delay; else if (rdev->desc->ramp_delay) ramp_delay = rdev->desc->ramp_delay; else if (rdev->constraints->settling_time) return rdev->constraints->settling_time; else if (rdev->constraints->settling_time_up && (new_uV > old_uV)) return rdev->constraints->settling_time_up; else if (rdev->constraints->settling_time_down && (new_uV < old_uV)) return rdev->constraints->settling_time_down; if (ramp_delay == 0) return 0; return DIV_ROUND_UP(abs(new_uV - old_uV), ramp_delay); } static int _regulator_do_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) { int ret; int delay = 0; int best_val = 0; unsigned int selector; int old_selector = -1; const struct regulator_ops *ops = rdev->desc->ops; int old_uV = regulator_get_voltage_rdev(rdev); trace_regulator_set_voltage(rdev_get_name(rdev), min_uV, max_uV); min_uV += rdev->constraints->uV_offset; max_uV += rdev->constraints->uV_offset; /* * If we can't obtain the old selector there is not enough * info to call set_voltage_time_sel(). 
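* In that case the selector-based path below applies no transition delay; * this typically happens when the regulator is disabled, where there is * no output ramp to wait for anyway.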
*/ if (_regulator_is_enabled(rdev) && ops->set_voltage_time_sel && ops->get_voltage_sel) { old_selector = ops->get_voltage_sel(rdev); if (old_selector < 0) return old_selector; } if (ops->set_voltage) { ret = _regulator_call_set_voltage(rdev, min_uV, max_uV, &selector); if (ret >= 0) { if (ops->list_voltage) best_val = ops->list_voltage(rdev, selector); else best_val = regulator_get_voltage_rdev(rdev); } } else if (ops->set_voltage_sel) { ret = regulator_map_voltage(rdev, min_uV, max_uV); if (ret >= 0) { best_val = ops->list_voltage(rdev, ret); if (min_uV <= best_val && max_uV >= best_val) { selector = ret; if (old_selector == selector) ret = 0; else if (rdev->desc->vsel_step) ret = _regulator_set_voltage_sel_step( rdev, best_val, selector); else ret = _regulator_call_set_voltage_sel( rdev, best_val, selector); } else { ret = -EINVAL; } } } else { ret = -EINVAL; } if (ret) goto out; if (ops->set_voltage_time_sel) { /* * Call set_voltage_time_sel if successfully obtained * old_selector */ if (old_selector >= 0 && old_selector != selector) delay = ops->set_voltage_time_sel(rdev, old_selector, selector); } else { if (old_uV != best_val) { if (ops->set_voltage_time) delay = ops->set_voltage_time(rdev, old_uV, best_val); else delay = _regulator_set_voltage_time(rdev, old_uV, best_val); } } if (delay < 0) { rdev_warn(rdev, "failed to get delay: %pe\n", ERR_PTR(delay)); delay = 0; } /* Insert any necessary delays */ fsleep(delay); if (best_val >= 0) { unsigned long data = best_val; _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, (void *)data); } out: trace_regulator_set_voltage_complete(rdev_get_name(rdev), best_val); return ret; } static int _regulator_do_set_suspend_voltage(struct regulator_dev *rdev, int min_uV, int max_uV, suspend_state_t state) { struct regulator_state *rstate; int uV, sel; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return -EINVAL; if (min_uV < rstate->min_uV) min_uV = rstate->min_uV; if (max_uV > rstate->max_uV) max_uV = rstate->max_uV; sel = regulator_map_voltage(rdev, min_uV, max_uV); if (sel < 0) return sel; uV = rdev->desc->ops->list_voltage(rdev, sel); if (uV >= min_uV && uV <= max_uV) rstate->uV = uV; return 0; } static int regulator_get_voltage_delta(struct regulator_dev *rdev, int uV) { int current_uV = regulator_get_voltage_rdev(rdev); if (current_uV < 0) return current_uV; return abs(current_uV - uV); } static int regulator_set_voltage_unlocked(struct regulator *regulator, int min_uV, int max_uV, suspend_state_t state) { struct regulator_dev *rdev = regulator->rdev; struct regulator_voltage *voltage = &regulator->voltage[state]; int ret = 0; int current_uV, delta, new_delta; int old_min_uV, old_max_uV; /* If we're setting the same range as last time the change * should be a noop (some cpufreq implementations use the same * voltage for multiple frequencies, for example). */ if (voltage->min_uV == min_uV && voltage->max_uV == max_uV) goto out; /* If we're trying to set a range that overlaps the current voltage, * return successfully even though the regulator does not support * changing the voltage.
*/ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) { current_uV = regulator_get_voltage_rdev(rdev); if (min_uV <= current_uV && current_uV <= max_uV) { voltage->min_uV = min_uV; voltage->max_uV = max_uV; goto out; } } /* sanity check */ if (!rdev->desc->ops->set_voltage && !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } /* constraints check */ ret = regulator_check_voltage(rdev, &min_uV, &max_uV); if (ret < 0) goto out; /* restore original values in case of error */ old_min_uV = voltage->min_uV; old_max_uV = voltage->max_uV; voltage->min_uV = min_uV; voltage->max_uV = max_uV; /* for not coupled regulators this will just set the voltage */ ret = regulator_balance_voltage(rdev, state); if (ret < 0) { voltage->min_uV = old_min_uV; voltage->max_uV = old_max_uV; } if (rdev->constraints->max_uV_step > 0) { /* For regulators with a maximum voltage step, reaching the desired * voltage might take a few retries. */ ret = regulator_get_voltage_delta(rdev, min_uV); if (ret < 0) goto out; delta = ret; while (delta > 0) { ret = regulator_balance_voltage(rdev, state); if (ret < 0) goto out; ret = regulator_get_voltage_delta(rdev, min_uV); if (ret < 0) goto out; new_delta = ret; /* check that voltage is converging quickly enough */ if (delta - new_delta < rdev->constraints->max_uV_step) { ret = -EWOULDBLOCK; goto out; } delta = new_delta; } } out: return ret; } int regulator_set_voltage_rdev(struct regulator_dev *rdev, int min_uV, int max_uV, suspend_state_t state) { int best_supply_uV = 0; int supply_change_uV = 0; int ret; if (rdev->supply && regulator_ops_is_valid(rdev->supply->rdev, REGULATOR_CHANGE_VOLTAGE) && (rdev->desc->min_dropout_uV || !(rdev->desc->ops->get_voltage || rdev->desc->ops->get_voltage_sel))) { int current_supply_uV; int selector; selector = regulator_map_voltage(rdev, min_uV, max_uV); if (selector < 0) { ret = selector; goto out; } best_supply_uV = _regulator_list_voltage(rdev, selector, 0); if (best_supply_uV < 0) { ret = best_supply_uV; goto out; } best_supply_uV += rdev->desc->min_dropout_uV; current_supply_uV = regulator_get_voltage_rdev(rdev->supply->rdev); if (current_supply_uV < 0) { ret = current_supply_uV; goto out; } supply_change_uV = best_supply_uV - current_supply_uV; } if (supply_change_uV > 0) { ret = regulator_set_voltage_unlocked(rdev->supply, best_supply_uV, INT_MAX, state); if (ret) { dev_err(&rdev->dev, "Failed to increase supply voltage: %pe\n", ERR_PTR(ret)); goto out; } } if (state == PM_SUSPEND_ON) ret = _regulator_do_set_voltage(rdev, min_uV, max_uV); else ret = _regulator_do_set_suspend_voltage(rdev, min_uV, max_uV, state); if (ret < 0) goto out; if (supply_change_uV < 0) { ret = regulator_set_voltage_unlocked(rdev->supply, best_supply_uV, INT_MAX, state); if (ret) dev_warn(&rdev->dev, "Failed to decrease supply voltage: %pe\n", ERR_PTR(ret)); /* No need to fail here */ ret = 0; } out: return ret; } EXPORT_SYMBOL_GPL(regulator_set_voltage_rdev); static int regulator_limit_voltage_step(struct regulator_dev *rdev, int *current_uV, int *min_uV) { struct regulation_constraints *constraints = rdev->constraints; /* Limit voltage change only if necessary */ if (!constraints->max_uV_step || !_regulator_is_enabled(rdev)) return 1; if (*current_uV < 0) { *current_uV = regulator_get_voltage_rdev(rdev); if (*current_uV < 0) return *current_uV; } if (abs(*current_uV - *min_uV) <= constraints->max_uV_step) return 1; /* Clamp target voltage within the given step */ if (*current_uV < *min_uV) *min_uV = min(*current_uV + 
constraints->max_uV_step, *min_uV); else *min_uV = max(*current_uV - constraints->max_uV_step, *min_uV); return 0; } static int regulator_get_optimal_voltage(struct regulator_dev *rdev, int *current_uV, int *min_uV, int *max_uV, suspend_state_t state, int n_coupled) { struct coupling_desc *c_desc = &rdev->coupling_desc; struct regulator_dev **c_rdevs = c_desc->coupled_rdevs; struct regulation_constraints *constraints = rdev->constraints; int desired_min_uV = 0, desired_max_uV = INT_MAX; int max_current_uV = 0, min_current_uV = INT_MAX; int highest_min_uV = 0, target_uV, possible_uV; int i, ret, max_spread; bool done; *current_uV = -1; /* * If there are no coupled regulators, simply set the voltage * demanded by consumers. */ if (n_coupled == 1) { /* * If consumers don't provide any demands, set voltage * to min_uV */ desired_min_uV = constraints->min_uV; desired_max_uV = constraints->max_uV; ret = regulator_check_consumers(rdev, &desired_min_uV, &desired_max_uV, state); if (ret < 0) return ret; done = true; goto finish; } /* Find highest min desired voltage */ for (i = 0; i < n_coupled; i++) { int tmp_min = 0; int tmp_max = INT_MAX; lockdep_assert_held_once(&c_rdevs[i]->mutex.base); ret = regulator_check_consumers(c_rdevs[i], &tmp_min, &tmp_max, state); if (ret < 0) return ret; ret = regulator_check_voltage(c_rdevs[i], &tmp_min, &tmp_max); if (ret < 0) return ret; highest_min_uV = max(highest_min_uV, tmp_min); if (i == 0) { desired_min_uV = tmp_min; desired_max_uV = tmp_max; } } max_spread = constraints->max_spread[0]; /* * Let target_uV be equal to the desired one if possible. * If not, set it to minimum voltage, allowed by other coupled * regulators. */ target_uV = max(desired_min_uV, highest_min_uV - max_spread); /* * Find min and max voltages, which currently aren't violating * max_spread. */ for (i = 1; i < n_coupled; i++) { int tmp_act; if (!_regulator_is_enabled(c_rdevs[i])) continue; tmp_act = regulator_get_voltage_rdev(c_rdevs[i]); if (tmp_act < 0) return tmp_act; min_current_uV = min(tmp_act, min_current_uV); max_current_uV = max(tmp_act, max_current_uV); } /* There aren't any other regulators enabled */ if (max_current_uV == 0) { possible_uV = target_uV; } else { /* * Correct target voltage, so as it currently isn't * violating max_spread */ possible_uV = max(target_uV, max_current_uV - max_spread); possible_uV = min(possible_uV, min_current_uV + max_spread); } if (possible_uV > desired_max_uV) return -EINVAL; done = (possible_uV == target_uV); desired_min_uV = possible_uV; finish: /* Apply max_uV_step constraint if necessary */ if (state == PM_SUSPEND_ON) { ret = regulator_limit_voltage_step(rdev, current_uV, &desired_min_uV); if (ret < 0) return ret; if (ret == 0) done = false; } /* Set current_uV if wasn't done earlier in the code and if necessary */ if (n_coupled > 1 && *current_uV == -1) { if (_regulator_is_enabled(rdev)) { ret = regulator_get_voltage_rdev(rdev); if (ret < 0) return ret; *current_uV = ret; } else { *current_uV = desired_min_uV; } } *min_uV = desired_min_uV; *max_uV = desired_max_uV; return done; } int regulator_do_balance_voltage(struct regulator_dev *rdev, suspend_state_t state, bool skip_coupled) { struct regulator_dev **c_rdevs; struct regulator_dev *best_rdev; struct coupling_desc *c_desc = &rdev->coupling_desc; int i, ret, n_coupled, best_min_uV, best_max_uV, best_c_rdev; unsigned int delta, best_delta; unsigned long c_rdev_done = 0; bool best_c_rdev_done; c_rdevs = c_desc->coupled_rdevs; n_coupled = skip_coupled ? 
1 : c_desc->n_coupled; /* * Find the best possible voltage change on each loop. Leave the loop * if there isn't any possible change. */ do { best_c_rdev_done = false; best_delta = 0; best_min_uV = 0; best_max_uV = 0; best_c_rdev = 0; best_rdev = NULL; /* * Find highest difference between optimal voltage * and current voltage. */ for (i = 0; i < n_coupled; i++) { /* * optimal_uV is the best voltage that can be set for * i-th regulator at the moment without violating * max_spread constraint in order to balance * the coupled voltages. */ int optimal_uV = 0, optimal_max_uV = 0, current_uV = 0; if (test_bit(i, &c_rdev_done)) continue; ret = regulator_get_optimal_voltage(c_rdevs[i], &current_uV, &optimal_uV, &optimal_max_uV, state, n_coupled); if (ret < 0) goto out; delta = abs(optimal_uV - current_uV); if (delta && best_delta <= delta) { best_c_rdev_done = ret; best_delta = delta; best_rdev = c_rdevs[i]; best_min_uV = optimal_uV; best_max_uV = optimal_max_uV; best_c_rdev = i; } } /* Nothing to change, return successfully */ if (!best_rdev) { ret = 0; goto out; } ret = regulator_set_voltage_rdev(best_rdev, best_min_uV, best_max_uV, state); if (ret < 0) goto out; if (best_c_rdev_done) set_bit(best_c_rdev, &c_rdev_done); } while (n_coupled > 1); out: return ret; } static int regulator_balance_voltage(struct regulator_dev *rdev, suspend_state_t state) { struct coupling_desc *c_desc = &rdev->coupling_desc; struct regulator_coupler *coupler = c_desc->coupler; bool skip_coupled = false; /* * If system is in a state other than PM_SUSPEND_ON, don't check * other coupled regulators. */ if (state != PM_SUSPEND_ON) skip_coupled = true; if (c_desc->n_resolved < c_desc->n_coupled) { rdev_err(rdev, "Not all coupled regulators registered\n"); return -EPERM; } /* Invoke custom balancer for customized couplers */ if (coupler && coupler->balance_voltage) return coupler->balance_voltage(coupler, rdev, state); return regulator_do_balance_voltage(rdev, state, skip_coupled); } /** * regulator_set_voltage - set regulator output voltage * @regulator: regulator source * @min_uV: Minimum required voltage in uV * @max_uV: Maximum acceptable voltage in uV * * Sets a voltage regulator to the desired output voltage. This can be set * during any regulator state. IOW, regulator can be disabled or enabled. * * If the regulator is enabled then the voltage will change to the new value * immediately otherwise if the regulator is disabled the regulator will * output at the new voltage when enabled. * * NOTE: If the regulator is shared between several devices then the lowest * request voltage that meets the system constraints will be used. * Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) { struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(regulator->rdev, &ww_ctx); ret = regulator_set_voltage_unlocked(regulator, min_uV, max_uV, PM_SUSPEND_ON); regulator_unlock_dependent(regulator->rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_set_voltage); static inline int regulator_suspend_toggle(struct regulator_dev *rdev, suspend_state_t state, bool en) { struct regulator_state *rstate; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return -EINVAL; if (!rstate->changeable) return -EPERM; rstate->enabled = (en) ?
ENABLE_IN_SUSPEND : DISABLE_IN_SUSPEND; return 0; } int regulator_suspend_enable(struct regulator_dev *rdev, suspend_state_t state) { return regulator_suspend_toggle(rdev, state, true); } EXPORT_SYMBOL_GPL(regulator_suspend_enable); int regulator_suspend_disable(struct regulator_dev *rdev, suspend_state_t state) { struct regulator *regulator; struct regulator_voltage *voltage; /* * If any consumer wants this regulator device kept on in * suspend states, don't set it as disabled. */ list_for_each_entry(regulator, &rdev->consumer_list, list) { voltage = &regulator->voltage[state]; if (voltage->min_uV || voltage->max_uV) return 0; } return regulator_suspend_toggle(rdev, state, false); } EXPORT_SYMBOL_GPL(regulator_suspend_disable); static int _regulator_set_suspend_voltage(struct regulator *regulator, int min_uV, int max_uV, suspend_state_t state) { struct regulator_dev *rdev = regulator->rdev; struct regulator_state *rstate; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return -EINVAL; if (rstate->min_uV == rstate->max_uV) { rdev_err(rdev, "The suspend voltage can't be changed!\n"); return -EPERM; } return regulator_set_voltage_unlocked(regulator, min_uV, max_uV, state); } int regulator_set_suspend_voltage(struct regulator *regulator, int min_uV, int max_uV, suspend_state_t state) { struct ww_acquire_ctx ww_ctx; int ret; /* PM_SUSPEND_ON is handled by regulator_set_voltage() */ if (regulator_check_states(state) || state == PM_SUSPEND_ON) return -EINVAL; regulator_lock_dependent(regulator->rdev, &ww_ctx); ret = _regulator_set_suspend_voltage(regulator, min_uV, max_uV, state); regulator_unlock_dependent(regulator->rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_set_suspend_voltage); /** * regulator_set_voltage_time - get raise/fall time * @regulator: regulator source * @old_uV: starting voltage in microvolts * @new_uV: target voltage in microvolts * * Provided with the starting and ending voltage, this function attempts to * calculate the time in microseconds required to rise or fall to this new * voltage. * * Return: ramp time in microseconds, or a negative error number if calculation failed.
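* * For instance, a hypothetical consumer might use it to budget a DVFS transition before actually changing the voltage (variable names are illustrative): * *	ramp_us = regulator_set_voltage_time(reg, old_uV, new_uV); *	if (ramp_us > 0) *		total_latency_us += ramp_us;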
*/ int regulator_set_voltage_time(struct regulator *regulator, int old_uV, int new_uV) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; int old_sel = -1; int new_sel = -1; int voltage; int i; if (ops->set_voltage_time) return ops->set_voltage_time(rdev, old_uV, new_uV); else if (!ops->set_voltage_time_sel) return _regulator_set_voltage_time(rdev, old_uV, new_uV); /* Currently requires operations to do this */ if (!ops->list_voltage || !rdev->desc->n_voltages) return -EINVAL; for (i = 0; i < rdev->desc->n_voltages; i++) { /* We only look for exact voltage matches here */ if (i < rdev->desc->linear_min_sel) continue; if (old_sel >= 0 && new_sel >= 0) break; voltage = regulator_list_voltage(regulator, i); if (voltage < 0) return -EINVAL; if (voltage == 0) continue; if (voltage == old_uV) old_sel = i; if (voltage == new_uV) new_sel = i; } if (old_sel < 0 || new_sel < 0) return -EINVAL; return ops->set_voltage_time_sel(rdev, old_sel, new_sel); } EXPORT_SYMBOL_GPL(regulator_set_voltage_time); /** * regulator_set_voltage_time_sel - get raise/fall time * @rdev: regulator source device * @old_selector: selector for starting voltage * @new_selector: selector for target voltage * * Provided with the starting and target voltage selectors, this function * returns the time in microseconds required to rise or fall to the new voltage. * * Drivers providing ramp_delay in regulation_constraints can use this as their * set_voltage_time_sel() operation. * * Return: ramp time in microseconds, or a negative error number if calculation failed. */ int regulator_set_voltage_time_sel(struct regulator_dev *rdev, unsigned int old_selector, unsigned int new_selector) { int old_volt, new_volt; /* sanity check */ if (!rdev->desc->ops->list_voltage) return -EINVAL; old_volt = rdev->desc->ops->list_voltage(rdev, old_selector); new_volt = rdev->desc->ops->list_voltage(rdev, new_selector); if (rdev->desc->ops->set_voltage_time) return rdev->desc->ops->set_voltage_time(rdev, old_volt, new_volt); else return _regulator_set_voltage_time(rdev, old_volt, new_volt); } EXPORT_SYMBOL_GPL(regulator_set_voltage_time_sel); int regulator_sync_voltage_rdev(struct regulator_dev *rdev) { int ret; regulator_lock(rdev); if (!rdev->desc->ops->set_voltage && !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } /* balance only, if regulator is coupled */ if (rdev->coupling_desc.n_coupled > 1) ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); else ret = -EOPNOTSUPP; out: regulator_unlock(rdev); return ret; } /** * regulator_sync_voltage - re-apply last regulator output voltage * @regulator: regulator source * * Re-apply the last configured voltage. This is intended to be used * where some external control source the consumer is cooperating with * has caused the configured voltage to change. * * Return: 0 on success or a negative error number on failure. */ int regulator_sync_voltage(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct regulator_voltage *voltage = &regulator->voltage[PM_SUSPEND_ON]; int ret, min_uV, max_uV; if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) return 0; regulator_lock(rdev); if (!rdev->desc->ops->set_voltage && !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } /* This is only going to work if we've had a voltage configured. */ if (!voltage->min_uV && !voltage->max_uV) { ret = -EINVAL; goto out; } min_uV = voltage->min_uV; max_uV = voltage->max_uV; /* This should be a paranoia check...
*/ ret = regulator_check_voltage(rdev, &min_uV, &max_uV); if (ret < 0) goto out; ret = regulator_check_consumers(rdev, &min_uV, &max_uV, 0); if (ret < 0) goto out; /* balance only, if regulator is coupled */ if (rdev->coupling_desc.n_coupled > 1) ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); else ret = _regulator_do_set_voltage(rdev, min_uV, max_uV); out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_sync_voltage); int regulator_get_voltage_rdev(struct regulator_dev *rdev) { int sel, ret; bool bypassed; if (rdev->desc->ops->get_bypass) { ret = rdev->desc->ops->get_bypass(rdev, &bypassed); if (ret < 0) return ret; if (bypassed) { /* if bypassed the regulator must have a supply */ if (!rdev->supply) { rdev_err(rdev, "bypassed regulator has no supply!\n"); return -EPROBE_DEFER; } return regulator_get_voltage_rdev(rdev->supply->rdev); } } if (rdev->desc->ops->get_voltage_sel) { sel = rdev->desc->ops->get_voltage_sel(rdev); if (sel < 0) return sel; ret = rdev->desc->ops->list_voltage(rdev, sel); } else if (rdev->desc->ops->get_voltage) { ret = rdev->desc->ops->get_voltage(rdev); } else if (rdev->desc->ops->list_voltage) { ret = rdev->desc->ops->list_voltage(rdev, 0); } else if (rdev->desc->fixed_uV && (rdev->desc->n_voltages == 1)) { ret = rdev->desc->fixed_uV; } else if (rdev->supply) { ret = regulator_get_voltage_rdev(rdev->supply->rdev); } else if (rdev->supply_name) { return -EPROBE_DEFER; } else { return -EINVAL; } if (ret < 0) return ret; return ret - rdev->constraints->uV_offset; } EXPORT_SYMBOL_GPL(regulator_get_voltage_rdev); /** * regulator_get_voltage - get regulator output voltage * @regulator: regulator source * * Return: Current regulator voltage in uV, or a negative error number on failure. * * NOTE: If the regulator is disabled it will return the voltage value. This * function should not be used to determine regulator state. */ int regulator_get_voltage(struct regulator *regulator) { struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(regulator->rdev, &ww_ctx); ret = regulator_get_voltage_rdev(regulator->rdev); regulator_unlock_dependent(regulator->rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_get_voltage); /** * regulator_set_current_limit - set regulator output current limit * @regulator: regulator source * @min_uA: Minimum supported current in uA * @max_uA: Maximum supported current in uA * * Sets current sink to the desired output current. This can be set during * any regulator state. IOW, regulator can be disabled or enabled. * * If the regulator is enabled then the current will change to the new value * immediately otherwise if the regulator is disabled the regulator will * output at the new current when enabled. * * NOTE: Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. * * Return: 0 on success or a negative error number on failure. 
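* * For example, a hypothetical charger driver might cap its input current at 500 mA; the values are in uA and purely illustrative: * *	ret = regulator_set_current_limit(charger_reg, 0, 500000); *	if (ret) *		dev_err(dev, "failed to set current limit: %d\n", ret);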
*/ int regulator_set_current_limit(struct regulator *regulator, int min_uA, int max_uA) { struct regulator_dev *rdev = regulator->rdev; int ret; regulator_lock(rdev); /* sanity check */ if (!rdev->desc->ops->set_current_limit) { ret = -EINVAL; goto out; } /* constraints check */ ret = regulator_check_current_limit(rdev, &min_uA, &max_uA); if (ret < 0) goto out; ret = rdev->desc->ops->set_current_limit(rdev, min_uA, max_uA); out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_set_current_limit); static int _regulator_get_current_limit_unlocked(struct regulator_dev *rdev) { /* sanity check */ if (!rdev->desc->ops->get_current_limit) return -EINVAL; return rdev->desc->ops->get_current_limit(rdev); } static int _regulator_get_current_limit(struct regulator_dev *rdev) { int ret; regulator_lock(rdev); ret = _regulator_get_current_limit_unlocked(rdev); regulator_unlock(rdev); return ret; } /** * regulator_get_current_limit - get regulator output current * @regulator: regulator source * * Return: Current supplied by the specified current sink in uA, * or a negative error number on failure. * * NOTE: If the regulator is disabled it will return the current value. This * function should not be used to determine regulator state. */ int regulator_get_current_limit(struct regulator *regulator) { return _regulator_get_current_limit(regulator->rdev); } EXPORT_SYMBOL_GPL(regulator_get_current_limit); /** * regulator_get_unclaimed_power_budget - get regulator unclaimed power budget * @regulator: regulator source * * Return: Unclaimed power budget of the regulator in mW. */ int regulator_get_unclaimed_power_budget(struct regulator *regulator) { return regulator->rdev->constraints->pw_budget_mW - regulator->rdev->pw_requested_mW; } EXPORT_SYMBOL_GPL(regulator_get_unclaimed_power_budget); /** * regulator_request_power_budget - request power budget on a regulator * @regulator: regulator source * @pw_req: Power requested in mW * * Return: 0 on success or a negative error number on failure. */ int regulator_request_power_budget(struct regulator *regulator, unsigned int pw_req) { struct regulator_dev *rdev = regulator->rdev; int ret = 0, pw_tot_req; regulator_lock(rdev); if (rdev->supply) { ret = regulator_request_power_budget(rdev->supply, pw_req); if (ret < 0) goto out; } pw_tot_req = rdev->pw_requested_mW + pw_req; if (pw_tot_req > rdev->constraints->pw_budget_mW) { rdev_warn(rdev, "power requested %d mW out of budget %d mW", pw_req, rdev->constraints->pw_budget_mW - rdev->pw_requested_mW); regulator_notifier_call_chain(rdev, REGULATOR_EVENT_OVER_CURRENT_WARN, NULL); ret = -ERANGE; goto out; } rdev->pw_requested_mW = pw_tot_req; out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_request_power_budget); /** * regulator_free_power_budget - free power budget on a regulator * @regulator: regulator source * @pw: Power to be released in mW.
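* * This is the counterpart to regulator_request_power_budget(); a hypothetical consumer pairs the two around a power-hungry operation (the 1500 mW figure is illustrative): * *	ret = regulator_request_power_budget(reg, 1500); *	if (ret) *		return ret; *	... run the operation ... *	regulator_free_power_budget(reg, 1500);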
*/ void regulator_free_power_budget(struct regulator *regulator, unsigned int pw) { struct regulator_dev *rdev = regulator->rdev; int pw_tot_req; regulator_lock(rdev); if (rdev->supply) regulator_free_power_budget(rdev->supply, pw); pw_tot_req = rdev->pw_requested_mW - pw; if (pw_tot_req >= 0) rdev->pw_requested_mW = pw_tot_req; else rdev_warn(rdev, "too much power freed %d mW (already requested %d mW)", pw, rdev->pw_requested_mW); regulator_unlock(rdev); } EXPORT_SYMBOL_GPL(regulator_free_power_budget); /** * regulator_set_mode - set regulator operating mode * @regulator: regulator source * @mode: operating mode - one of the REGULATOR_MODE constants * * Set regulator operating mode to increase regulator efficiency or improve * regulation performance. * * NOTE: Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_mode(struct regulator *regulator, unsigned int mode) { struct regulator_dev *rdev = regulator->rdev; int ret; int regulator_curr_mode; regulator_lock(rdev); /* sanity check */ if (!rdev->desc->ops->set_mode) { ret = -EINVAL; goto out; } /* return if the same mode is requested */ if (rdev->desc->ops->get_mode) { regulator_curr_mode = rdev->desc->ops->get_mode(rdev); if (regulator_curr_mode == mode) { ret = 0; goto out; } } /* constraints check */ ret = regulator_mode_constrain(rdev, &mode); if (ret < 0) goto out; ret = rdev->desc->ops->set_mode(rdev, mode); out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_set_mode); static unsigned int _regulator_get_mode_unlocked(struct regulator_dev *rdev) { /* sanity check */ if (!rdev->desc->ops->get_mode) return -EINVAL; return rdev->desc->ops->get_mode(rdev); } static unsigned int _regulator_get_mode(struct regulator_dev *rdev) { int ret; regulator_lock(rdev); ret = _regulator_get_mode_unlocked(rdev); regulator_unlock(rdev); return ret; } /** * regulator_get_mode - get regulator operating mode * @regulator: regulator source * * Get the current regulator operating mode. * * Return: Current operating mode as %REGULATOR_MODE_* values, * or a negative error number on failure. */ unsigned int regulator_get_mode(struct regulator *regulator) { return _regulator_get_mode(regulator->rdev); } EXPORT_SYMBOL_GPL(regulator_get_mode); static int rdev_get_cached_err_flags(struct regulator_dev *rdev) { int ret = 0; if (rdev->use_cached_err) { spin_lock(&rdev->err_lock); ret = rdev->cached_err; spin_unlock(&rdev->err_lock); } return ret; } static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags) { int cached_flags, ret = 0; regulator_lock(rdev); cached_flags = rdev_get_cached_err_flags(rdev); if (rdev->desc->ops->get_error_flags) ret = rdev->desc->ops->get_error_flags(rdev, flags); else if (!rdev->use_cached_err) ret = -EINVAL; *flags |= cached_flags; regulator_unlock(rdev); return ret; } /** * regulator_get_error_flags - get regulator error information * @regulator: regulator source * @flags: pointer to store error flags * * Get the current regulator error information. * * Return: 0 on success or a negative error number on failure. 
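* * A sketch of hypothetical consumer usage, testing individual %REGULATOR_ERROR_* bits in the returned mask: * *	unsigned int flags; * *	if (!regulator_get_error_flags(reg, &flags)) { *		if (flags & REGULATOR_ERROR_UNDER_VOLTAGE) *			dev_warn(dev, "supply under-voltage\n"); *		if (flags & REGULATOR_ERROR_OVER_CURRENT) *			dev_warn(dev, "supply over-current\n"); *	}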
*/ int regulator_get_error_flags(struct regulator *regulator, unsigned int *flags) { return _regulator_get_error_flags(regulator->rdev, flags); } EXPORT_SYMBOL_GPL(regulator_get_error_flags); /** * regulator_set_load - set regulator load * @regulator: regulator source * @uA_load: load current * * Notifies the regulator core of a new device load. This is then used by * DRMS (if enabled by constraints) to set the most efficient regulator * operating mode for the new regulator loading. * * Consumer devices notify their supply regulator of the maximum power * they will require (can be taken from device datasheet in the power * consumption tables) when they change operational status and hence power * state. Examples of operational state changes that can affect power * consumption are :- * * o Device is opened / closed. * o Device I/O is about to begin or has just finished. * o Device is idling in between work. * * This information is also exported via sysfs to userspace. * * DRMS will sum the total requested load on the regulator and change * to the most efficient operating mode if platform constraints allow. * * NOTE: when a regulator consumer requests to have a regulator * disabled then any load that consumer requested no longer counts * toward the total requested load. If the regulator is re-enabled * then the previously requested load will start counting again. * * If a regulator is an always-on regulator then an individual consumer's * load will still be removed if that consumer is fully disabled. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_load(struct regulator *regulator, int uA_load) { struct regulator_dev *rdev = regulator->rdev; int old_uA_load; int ret = 0; regulator_lock(rdev); old_uA_load = regulator->uA_load; regulator->uA_load = uA_load; if (regulator->enable_count && old_uA_load != uA_load) { ret = drms_uA_update(rdev); if (ret < 0) regulator->uA_load = old_uA_load; } regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_set_load); /** * regulator_allow_bypass - allow the regulator to go into bypass mode * * @regulator: Regulator to configure * @enable: enable or disable bypass mode * * Allow the regulator to go into bypass mode if all other consumers * for the regulator also enable bypass mode and the machine * constraints allow this. Bypass mode means that the regulator is * simply passing the input directly to the output with no regulation. * * Return: 0 on success or if changing bypass is not possible, or * a negative error number on failure. 
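* * A sketch of hypothetical consumer usage around a low-power system state (illustrative only; the regulator only actually enters bypass once every consumer has agreed): * *	regulator_allow_bypass(reg, true); *	... enter low-power state ... *	regulator_allow_bypass(reg, false);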
*/ int regulator_allow_bypass(struct regulator *regulator, bool enable) { struct regulator_dev *rdev = regulator->rdev; const char *name = rdev_get_name(rdev); int ret = 0; if (!rdev->desc->ops->set_bypass) return 0; if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_BYPASS)) return 0; regulator_lock(rdev); if (enable && !regulator->bypass) { rdev->bypass_count++; if (rdev->bypass_count == rdev->open_count) { trace_regulator_bypass_enable(name); ret = rdev->desc->ops->set_bypass(rdev, enable); if (ret != 0) rdev->bypass_count--; else trace_regulator_bypass_enable_complete(name); } } else if (!enable && regulator->bypass) { rdev->bypass_count--; if (rdev->bypass_count != rdev->open_count) { trace_regulator_bypass_disable(name); ret = rdev->desc->ops->set_bypass(rdev, enable); if (ret != 0) rdev->bypass_count++; else trace_regulator_bypass_disable_complete(name); } } if (ret == 0) regulator->bypass = enable; regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_allow_bypass); /** * regulator_register_notifier - register regulator event notifier * @regulator: regulator source * @nb: notifier block * * Register notifier block to receive regulator events. * * Return: 0 on success or a negative error number on failure. */ int regulator_register_notifier(struct regulator *regulator, struct notifier_block *nb) { return blocking_notifier_chain_register(&regulator->rdev->notifier, nb); } EXPORT_SYMBOL_GPL(regulator_register_notifier); /** * regulator_unregister_notifier - unregister regulator event notifier * @regulator: regulator source * @nb: notifier block * * Unregister regulator event notifier block. * * Return: 0 on success or a negative error number on failure. */ int regulator_unregister_notifier(struct regulator *regulator, struct notifier_block *nb) { return blocking_notifier_chain_unregister(&regulator->rdev->notifier, nb); } EXPORT_SYMBOL_GPL(regulator_unregister_notifier); /* notify regulator consumers and downstream regulator consumers. * Note mutex must be held by caller. */ static int _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { /* call rdev chain first */ int ret = blocking_notifier_call_chain(&rdev->notifier, event, data); if (IS_REACHABLE(CONFIG_REGULATOR_NETLINK_EVENTS)) { struct device *parent = rdev->dev.parent; const char *rname = rdev_get_name(rdev); char name[32]; /* Avoid duplicate debugfs directory names */ if (parent && rname == rdev->desc->name) { snprintf(name, sizeof(name), "%s-%s", dev_name(parent), rname); rname = name; } reg_generate_netlink_event(rname, event); } return ret; } int _regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers, enum regulator_get_type get_type) { int i; int ret; for (i = 0; i < num_consumers; i++) consumers[i].consumer = NULL; for (i = 0; i < num_consumers; i++) { consumers[i].consumer = _regulator_get(dev, consumers[i].supply, get_type); if (IS_ERR(consumers[i].consumer)) { ret = dev_err_probe(dev, PTR_ERR(consumers[i].consumer), "Failed to get supply '%s'\n", consumers[i].supply); consumers[i].consumer = NULL; goto err; } if (consumers[i].init_load_uA > 0) { ret = regulator_set_load(consumers[i].consumer, consumers[i].init_load_uA); if (ret) { i++; goto err; } } } return 0; err: while (--i >= 0) regulator_put(consumers[i].consumer); return ret; } /** * regulator_bulk_get - get multiple regulator consumers * * @dev: Device to supply * @num_consumers: Number of consumers to register * @consumers: Configuration of consumers; clients are stored here.
* * This helper function allows drivers to get several regulator * consumers in one operation. If any of the regulators cannot be * acquired then any regulators that were allocated will be freed * before returning to the caller. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers) { return _regulator_bulk_get(dev, num_consumers, consumers, NORMAL_GET); } EXPORT_SYMBOL_GPL(regulator_bulk_get); static void regulator_bulk_enable_async(void *data, async_cookie_t cookie) { struct regulator_bulk_data *bulk = data; bulk->ret = regulator_enable(bulk->consumer); } /** * regulator_bulk_enable - enable multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to enable multiple regulator * clients in a single API call. If any consumers cannot be enabled * then any others that were enabled will be disabled again prior to * return. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { ASYNC_DOMAIN_EXCLUSIVE(async_domain); int i; int ret = 0; for (i = 0; i < num_consumers; i++) { async_schedule_domain(regulator_bulk_enable_async, &consumers[i], &async_domain); } async_synchronize_full_domain(&async_domain); /* If any consumer failed we need to unwind any that succeeded */ for (i = 0; i < num_consumers; i++) { if (consumers[i].ret != 0) { ret = consumers[i].ret; goto err; } } return 0; err: for (i = 0; i < num_consumers; i++) { if (consumers[i].ret < 0) pr_err("Failed to enable %s: %pe\n", consumers[i].supply, ERR_PTR(consumers[i].ret)); else regulator_disable(consumers[i].consumer); } return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_enable); /** * regulator_bulk_disable - disable multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to disable multiple regulator * clients in a single API call. If any consumers cannot be disabled * then any others that were disabled will be enabled again prior to * return. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers) { int i; int ret, r; for (i = num_consumers - 1; i >= 0; --i) { ret = regulator_disable(consumers[i].consumer); if (ret != 0) goto err; } return 0; err: pr_err("Failed to disable %s: %pe\n", consumers[i].supply, ERR_PTR(ret)); for (++i; i < num_consumers; ++i) { r = regulator_enable(consumers[i].consumer); if (r != 0) pr_err("Failed to re-enable %s: %pe\n", consumers[i].supply, ERR_PTR(r)); } return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_disable); /** * regulator_bulk_force_disable - force disable multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to forcibly disable multiple regulator * clients in a single API call. * NOTE: This should be used for situations when device damage will * likely occur if the regulators are not disabled (e.g. over temp). * Although regulator_force_disable() can return an error for some * consumers, it is still called for all consumers. * * Return: 0 on success or a negative error number on failure.
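* * For reference, the bulk helpers are normally used together; a hypothetical consumer sketch with illustrative supply names: * *	struct regulator_bulk_data bulk[] = { *		{ .supply = "vdd" }, *		{ .supply = "vio" }, *	}; * *	ret = regulator_bulk_get(dev, ARRAY_SIZE(bulk), bulk); *	if (!ret) *		ret = regulator_bulk_enable(ARRAY_SIZE(bulk), bulk); *	... *	regulator_bulk_disable(ARRAY_SIZE(bulk), bulk); *	regulator_bulk_free(ARRAY_SIZE(bulk), bulk);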
*/ int regulator_bulk_force_disable(int num_consumers, struct regulator_bulk_data *consumers) { int i; int ret = 0; for (i = 0; i < num_consumers; i++) { consumers[i].ret = regulator_force_disable(consumers[i].consumer); /* Store first error for reporting */ if (consumers[i].ret && !ret) ret = consumers[i].ret; } return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_force_disable); /** * regulator_bulk_free - free multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to free multiple regulator * clients in a single API call. */ void regulator_bulk_free(int num_consumers, struct regulator_bulk_data *consumers) { int i; for (i = 0; i < num_consumers; i++) { regulator_put(consumers[i].consumer); consumers[i].consumer = NULL; } } EXPORT_SYMBOL_GPL(regulator_bulk_free); /** * regulator_handle_critical - Handle events for system-critical regulators. * @rdev: The regulator device. * @event: The event being handled. * * This function handles critical events such as under-voltage, over-current, * and unknown errors for regulators deemed system-critical. On detecting such * events, it triggers a hardware protection shutdown with a defined timeout. */ static void regulator_handle_critical(struct regulator_dev *rdev, unsigned long event) { const char *reason = NULL; if (!rdev->constraints->system_critical) return; switch (event) { case REGULATOR_EVENT_UNDER_VOLTAGE: reason = "System critical regulator: voltage drop detected"; break; case REGULATOR_EVENT_OVER_CURRENT: reason = "System critical regulator: over-current detected"; break; case REGULATOR_EVENT_FAIL: reason = "System critical regulator: unknown error"; } if (!reason) return; hw_protection_trigger(reason, rdev->constraints->uv_less_critical_window_ms); } /** * regulator_notifier_call_chain - call regulator event notifier * @rdev: regulator source * @event: event to notify consumers of (REGULATOR_EVENT_*) * @data: callback-specific data. * * Called by regulator drivers to notify clients a regulator event has * occurred. * * Return: %NOTIFY_DONE. */ int regulator_notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { regulator_handle_critical(rdev, event); _notifier_call_chain(rdev, event, data); return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(regulator_notifier_call_chain); /** * regulator_mode_to_status - convert a regulator mode into a status * * @mode: Mode to convert * * Convert a regulator mode into a status. * * Return: %REGULATOR_STATUS_* value corresponding to given mode.
*/ int regulator_mode_to_status(unsigned int mode) { switch (mode) { case REGULATOR_MODE_FAST: return REGULATOR_STATUS_FAST; case REGULATOR_MODE_NORMAL: return REGULATOR_STATUS_NORMAL; case REGULATOR_MODE_IDLE: return REGULATOR_STATUS_IDLE; case REGULATOR_MODE_STANDBY: return REGULATOR_STATUS_STANDBY; default: return REGULATOR_STATUS_UNDEFINED; } } EXPORT_SYMBOL_GPL(regulator_mode_to_status); static struct attribute *regulator_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_num_users.attr, &dev_attr_type.attr, &dev_attr_microvolts.attr, &dev_attr_microamps.attr, &dev_attr_opmode.attr, &dev_attr_state.attr, &dev_attr_status.attr, &dev_attr_bypass.attr, &dev_attr_requested_microamps.attr, &dev_attr_min_microvolts.attr, &dev_attr_max_microvolts.attr, &dev_attr_min_microamps.attr, &dev_attr_max_microamps.attr, &dev_attr_under_voltage.attr, &dev_attr_over_current.attr, &dev_attr_regulation_out.attr, &dev_attr_fail.attr, &dev_attr_over_temp.attr, &dev_attr_under_voltage_warn.attr, &dev_attr_over_current_warn.attr, &dev_attr_over_voltage_warn.attr, &dev_attr_over_temp_warn.attr, &dev_attr_suspend_standby_state.attr, &dev_attr_suspend_mem_state.attr, &dev_attr_suspend_disk_state.attr, &dev_attr_suspend_standby_microvolts.attr, &dev_attr_suspend_mem_microvolts.attr, &dev_attr_suspend_disk_microvolts.attr, &dev_attr_suspend_standby_mode.attr, &dev_attr_suspend_mem_mode.attr, &dev_attr_suspend_disk_mode.attr, &dev_attr_power_budget_milliwatt.attr, &dev_attr_power_requested_milliwatt.attr, NULL }; /* * To avoid cluttering sysfs (and memory) with useless state, only * create attributes that can be meaningfully displayed. */ static umode_t regulator_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct device *dev = kobj_to_dev(kobj); struct regulator_dev *rdev = dev_to_rdev(dev); const struct regulator_ops *ops = rdev->desc->ops; umode_t mode = attr->mode; /* these three are always present */ if (attr == &dev_attr_name.attr || attr == &dev_attr_num_users.attr || attr == &dev_attr_type.attr) return mode; /* some attributes need specific methods to be displayed */ if (attr == &dev_attr_microvolts.attr) { if ((ops->get_voltage && ops->get_voltage(rdev) >= 0) || (ops->get_voltage_sel && ops->get_voltage_sel(rdev) >= 0) || (ops->list_voltage && ops->list_voltage(rdev, 0) >= 0) || (rdev->desc->fixed_uV && rdev->desc->n_voltages == 1)) return mode; return 0; } if (attr == &dev_attr_microamps.attr) return ops->get_current_limit ? mode : 0; if (attr == &dev_attr_opmode.attr) return ops->get_mode ? mode : 0; if (attr == &dev_attr_state.attr) return (rdev->ena_pin || ops->is_enabled) ? mode : 0; if (attr == &dev_attr_status.attr) return ops->get_status ? mode : 0; if (attr == &dev_attr_bypass.attr) return ops->get_bypass ? mode : 0; if (attr == &dev_attr_under_voltage.attr || attr == &dev_attr_over_current.attr || attr == &dev_attr_regulation_out.attr || attr == &dev_attr_fail.attr || attr == &dev_attr_over_temp.attr || attr == &dev_attr_under_voltage_warn.attr || attr == &dev_attr_over_current_warn.attr || attr == &dev_attr_over_voltage_warn.attr || attr == &dev_attr_over_temp_warn.attr) return ops->get_error_flags ? mode : 0; /* constraints need specific supporting methods */ if (attr == &dev_attr_min_microvolts.attr || attr == &dev_attr_max_microvolts.attr) return (ops->set_voltage || ops->set_voltage_sel) ? mode : 0; if (attr == &dev_attr_min_microamps.attr || attr == &dev_attr_max_microamps.attr) return ops->set_current_limit ? 
mode : 0; if (attr == &dev_attr_suspend_standby_state.attr || attr == &dev_attr_suspend_mem_state.attr || attr == &dev_attr_suspend_disk_state.attr) return mode; if (attr == &dev_attr_suspend_standby_microvolts.attr || attr == &dev_attr_suspend_mem_microvolts.attr || attr == &dev_attr_suspend_disk_microvolts.attr) return ops->set_suspend_voltage ? mode : 0; if (attr == &dev_attr_suspend_standby_mode.attr || attr == &dev_attr_suspend_mem_mode.attr || attr == &dev_attr_suspend_disk_mode.attr) return ops->set_suspend_mode ? mode : 0; if (attr == &dev_attr_power_budget_milliwatt.attr || attr == &dev_attr_power_requested_milliwatt.attr) return rdev->constraints->pw_budget_mW != INT_MAX ? mode : 0; return mode; } static const struct attribute_group regulator_dev_group = { .attrs = regulator_dev_attrs, .is_visible = regulator_attr_is_visible, }; static const struct attribute_group *regulator_dev_groups[] = { &regulator_dev_group, NULL }; static void regulator_dev_release(struct device *dev) { struct regulator_dev *rdev = dev_get_drvdata(dev); debugfs_remove_recursive(rdev->debugfs); kfree(rdev->constraints); of_node_put(rdev->dev.of_node); kfree(rdev); } static void rdev_init_debugfs(struct regulator_dev *rdev) { struct device *parent = rdev->dev.parent; const char *rname = rdev_get_name(rdev); char name[NAME_MAX]; /* Avoid duplicate debugfs directory names */ if (parent && rname == rdev->desc->name) { snprintf(name, sizeof(name), "%s-%s", dev_name(parent), rname); rname = name; } rdev->debugfs = debugfs_create_dir(rname, debugfs_root); if (IS_ERR(rdev->debugfs)) rdev_dbg(rdev, "Failed to create debugfs directory\n"); debugfs_create_u32("use_count", 0444, rdev->debugfs, &rdev->use_count); debugfs_create_u32("open_count", 0444, rdev->debugfs, &rdev->open_count); debugfs_create_u32("bypass_count", 0444, rdev->debugfs, &rdev->bypass_count); } static int regulator_register_resolve_supply(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); if (regulator_resolve_supply(rdev)) rdev_dbg(rdev, "unable to resolve supply\n"); return 0; } int regulator_coupler_register(struct regulator_coupler *coupler) { mutex_lock(&regulator_list_mutex); list_add_tail(&coupler->list, &regulator_coupler_list); mutex_unlock(&regulator_list_mutex); return 0; } static struct regulator_coupler * regulator_find_coupler(struct regulator_dev *rdev) { struct regulator_coupler *coupler; int err; /* * Note that regulators are appended to the list and the generic * coupler is registered first, hence it will be attached last * if nobody cared.
*/ list_for_each_entry_reverse(coupler, &regulator_coupler_list, list) { err = coupler->attach_regulator(coupler, rdev); if (!err) { if (!coupler->balance_voltage && rdev->coupling_desc.n_coupled > 2) goto err_unsupported; return coupler; } if (err < 0) return ERR_PTR(err); if (err == 1) continue; break; } return ERR_PTR(-EINVAL); err_unsupported: if (coupler->detach_regulator) coupler->detach_regulator(coupler, rdev); rdev_err(rdev, "Voltage balancing for multiple regulator couples is unimplemented\n"); return ERR_PTR(-EPERM); } static void regulator_resolve_coupling(struct regulator_dev *rdev) { struct regulator_coupler *coupler = rdev->coupling_desc.coupler; struct coupling_desc *c_desc = &rdev->coupling_desc; int n_coupled = c_desc->n_coupled; struct regulator_dev *c_rdev; int i; for (i = 1; i < n_coupled; i++) { /* already resolved */ if (c_desc->coupled_rdevs[i]) continue; c_rdev = of_parse_coupled_regulator(rdev, i - 1); if (!c_rdev) continue; if (c_rdev->coupling_desc.coupler != coupler) { rdev_err(rdev, "coupler mismatch with %s\n", rdev_get_name(c_rdev)); return; } c_desc->coupled_rdevs[i] = c_rdev; c_desc->n_resolved++; regulator_resolve_coupling(c_rdev); } } static void regulator_remove_coupling(struct regulator_dev *rdev) { struct regulator_coupler *coupler = rdev->coupling_desc.coupler; struct coupling_desc *__c_desc, *c_desc = &rdev->coupling_desc; struct regulator_dev *__c_rdev, *c_rdev; unsigned int __n_coupled, n_coupled; int i, k; int err; n_coupled = c_desc->n_coupled; for (i = 1; i < n_coupled; i++) { c_rdev = c_desc->coupled_rdevs[i]; if (!c_rdev) continue; regulator_lock(c_rdev); __c_desc = &c_rdev->coupling_desc; __n_coupled = __c_desc->n_coupled; for (k = 1; k < __n_coupled; k++) { __c_rdev = __c_desc->coupled_rdevs[k]; if (__c_rdev == rdev) { __c_desc->coupled_rdevs[k] = NULL; __c_desc->n_resolved--; break; } } regulator_unlock(c_rdev); c_desc->coupled_rdevs[i] = NULL; c_desc->n_resolved--; } if (coupler && coupler->detach_regulator) { err = coupler->detach_regulator(coupler, rdev); if (err) rdev_err(rdev, "failed to detach from coupler: %pe\n", ERR_PTR(err)); } rdev->coupling_desc.n_coupled = 0; kfree(rdev->coupling_desc.coupled_rdevs); rdev->coupling_desc.coupled_rdevs = NULL; } static int regulator_init_coupling(struct regulator_dev *rdev) { struct regulator_dev **coupled; int err, n_phandles; if (!IS_ENABLED(CONFIG_OF)) n_phandles = 0; else n_phandles = of_get_n_coupled(rdev); coupled = kcalloc(n_phandles + 1, sizeof(*coupled), GFP_KERNEL); if (!coupled) return -ENOMEM; rdev->coupling_desc.coupled_rdevs = coupled; /* * Every regulator should always have a coupling descriptor filled with * at least a pointer to itself.
*/ rdev->coupling_desc.coupled_rdevs[0] = rdev; rdev->coupling_desc.n_coupled = n_phandles + 1; rdev->coupling_desc.n_resolved++; /* regulator isn't coupled */ if (n_phandles == 0) return 0; if (!of_check_coupling_data(rdev)) return -EPERM; mutex_lock(&regulator_list_mutex); rdev->coupling_desc.coupler = regulator_find_coupler(rdev); mutex_unlock(&regulator_list_mutex); if (IS_ERR(rdev->coupling_desc.coupler)) { err = PTR_ERR(rdev->coupling_desc.coupler); rdev_err(rdev, "failed to get coupler: %pe\n", ERR_PTR(err)); return err; } return 0; } static int generic_coupler_attach(struct regulator_coupler *coupler, struct regulator_dev *rdev) { if (rdev->coupling_desc.n_coupled > 2) { rdev_err(rdev, "Voltage balancing for multiple regulator couples is unimplemented\n"); return -EPERM; } if (!rdev->constraints->always_on) { rdev_err(rdev, "Coupling of a non always-on regulator is unimplemented\n"); return -ENOTSUPP; } return 0; } static struct regulator_coupler generic_regulator_coupler = { .attach_regulator = generic_coupler_attach, }; /** * regulator_register - register regulator * @dev: the device that drives the regulator * @regulator_desc: regulator to register * @cfg: runtime configuration for regulator * * Called by regulator drivers to register a regulator. * * Return: Pointer to a valid &struct regulator_dev on success or * an ERR_PTR() encoded negative error number on failure. */ struct regulator_dev * regulator_register(struct device *dev, const struct regulator_desc *regulator_desc, const struct regulator_config *cfg) { const struct regulator_init_data *init_data; struct regulator_config *config = NULL; static atomic_t regulator_no = ATOMIC_INIT(-1); struct regulator_dev *rdev; bool dangling_cfg_gpiod = false; bool dangling_of_gpiod = false; int ret, i; bool resolved_early = false; if (cfg == NULL) return ERR_PTR(-EINVAL); if (cfg->ena_gpiod) dangling_cfg_gpiod = true; if (regulator_desc == NULL) { ret = -EINVAL; goto rinse; } WARN_ON(!dev || !cfg->dev); if (regulator_desc->name == NULL || regulator_desc->ops == NULL) { ret = -EINVAL; goto rinse; } if (regulator_desc->type != REGULATOR_VOLTAGE && regulator_desc->type != REGULATOR_CURRENT) { ret = -EINVAL; goto rinse; } /* Only one of each should be implemented */ WARN_ON(regulator_desc->ops->get_voltage && regulator_desc->ops->get_voltage_sel); WARN_ON(regulator_desc->ops->set_voltage && regulator_desc->ops->set_voltage_sel); /* If we're using selectors we must implement list_voltage. */ if (regulator_desc->ops->get_voltage_sel && !regulator_desc->ops->list_voltage) { ret = -EINVAL; goto rinse; } if (regulator_desc->ops->set_voltage_sel && !regulator_desc->ops->list_voltage) { ret = -EINVAL; goto rinse; } rdev = kzalloc(sizeof(struct regulator_dev), GFP_KERNEL); if (rdev == NULL) { ret = -ENOMEM; goto rinse; } device_initialize(&rdev->dev); dev_set_drvdata(&rdev->dev, rdev); rdev->dev.class = &regulator_class; spin_lock_init(&rdev->err_lock); /* * Duplicate the config so the driver could override it after * parsing init data. */ config = kmemdup(cfg, sizeof(*cfg), GFP_KERNEL); if (config == NULL) { ret = -ENOMEM; goto clean; } /* * DT may override the config->init_data provided if the platform * needs to do so. If so, config->init_data is completely ignored. */ init_data = regulator_of_get_init_data(dev, regulator_desc, config, &rdev->dev.of_node); /* * Sometimes not all resources are probed already so we need to take * that into account. This happens most of the time if the ena_gpiod comes * from a gpio extender or something else.
*/ if (PTR_ERR(init_data) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto clean; } /* * We need to keep track of any GPIO descriptor coming from the * device tree until we have handed it over to the core. If the * config that was passed in to this function DOES NOT contain * a descriptor, and the config after this call DOES contain * a descriptor, we definitely got one from parsing the device * tree. */ if (!cfg->ena_gpiod && config->ena_gpiod) dangling_of_gpiod = true; if (!init_data) { init_data = config->init_data; rdev->dev.of_node = of_node_get(config->of_node); } ww_mutex_init(&rdev->mutex, &regulator_ww_class); rdev->reg_data = config->driver_data; rdev->owner = regulator_desc->owner; rdev->desc = regulator_desc; if (config->regmap) rdev->regmap = config->regmap; else if (dev_get_regmap(dev, NULL)) rdev->regmap = dev_get_regmap(dev, NULL); else if (dev->parent) rdev->regmap = dev_get_regmap(dev->parent, NULL); INIT_LIST_HEAD(&rdev->consumer_list); INIT_LIST_HEAD(&rdev->list); BLOCKING_INIT_NOTIFIER_HEAD(&rdev->notifier); INIT_DELAYED_WORK(&rdev->disable_work, regulator_disable_work); if (init_data && init_data->supply_regulator) rdev->supply_name = init_data->supply_regulator; else if (regulator_desc->supply_name) rdev->supply_name = regulator_desc->supply_name; /* register with sysfs */ rdev->dev.parent = config->dev; dev_set_name(&rdev->dev, "regulator.%lu", (unsigned long) atomic_inc_return(&regulator_no)); /* set regulator constraints */ if (init_data) rdev->constraints = kmemdup(&init_data->constraints, sizeof(*rdev->constraints), GFP_KERNEL); else rdev->constraints = kzalloc(sizeof(*rdev->constraints), GFP_KERNEL); if (!rdev->constraints) { ret = -ENOMEM; goto wash; } if (regulator_desc->init_cb) { ret = regulator_desc->init_cb(rdev, config); if (ret < 0) goto wash; } if ((rdev->supply_name && !rdev->supply) && (rdev->constraints->always_on || rdev->constraints->boot_on)) { ret = regulator_resolve_supply(rdev); if (ret) rdev_dbg(rdev, "unable to resolve supply early: %pe\n", ERR_PTR(ret)); resolved_early = true; } if (config->ena_gpiod) { ret = regulator_ena_gpio_request(rdev, config); if (ret != 0) { rdev_err(rdev, "Failed to request enable GPIO: %pe\n", ERR_PTR(ret)); goto wash; } /* The regulator core took over the GPIO descriptor */ dangling_cfg_gpiod = false; dangling_of_gpiod = false; } ret = set_machine_constraints(rdev); if (ret == -EPROBE_DEFER && !resolved_early) { /* Regulator might be in bypass mode and so needs its supply * to set the constraints */ /* FIXME: this currently triggers a chicken-and-egg problem * when creating -SUPPLY symlink in sysfs to a regulator * that is just being created */ rdev_dbg(rdev, "will resolve supply early: %s\n", rdev->supply_name); ret = regulator_resolve_supply(rdev); if (!ret) ret = set_machine_constraints(rdev); else rdev_dbg(rdev, "unable to resolve supply early: %pe\n", ERR_PTR(ret)); } if (ret < 0) goto wash; ret = regulator_init_coupling(rdev); if (ret < 0) goto wash; /* add consumer devices */ if (init_data) { for (i = 0; i < init_data->num_consumer_supplies; i++) { ret = set_consumer_device_supply(rdev, init_data->consumer_supplies[i].dev_name, init_data->consumer_supplies[i].supply); if (ret < 0) { dev_err(dev, "Failed to set supply %s\n", init_data->consumer_supplies[i].supply); goto unset_supplies; } } } if (!rdev->desc->ops->get_voltage && !rdev->desc->ops->list_voltage && !rdev->desc->fixed_uV) rdev->is_switch = true; ret = device_add(&rdev->dev); if (ret != 0) goto unset_supplies; rdev_init_debugfs(rdev); /* try to resolve
regulators coupling since a new one was registered */ mutex_lock(&regulator_list_mutex); regulator_resolve_coupling(rdev); mutex_unlock(&regulator_list_mutex); /* try to resolve regulators supply since a new one was registered */ class_for_each_device(&regulator_class, NULL, NULL, regulator_register_resolve_supply); kfree(config); return rdev; unset_supplies: mutex_lock(&regulator_list_mutex); unset_regulator_supplies(rdev); regulator_remove_coupling(rdev); mutex_unlock(&regulator_list_mutex); wash: regulator_put(rdev->supply); kfree(rdev->coupling_desc.coupled_rdevs); mutex_lock(&regulator_list_mutex); regulator_ena_gpio_free(rdev); mutex_unlock(&regulator_list_mutex); clean: if (dangling_of_gpiod) gpiod_put(config->ena_gpiod); kfree(config); put_device(&rdev->dev); rinse: if (dangling_cfg_gpiod) gpiod_put(cfg->ena_gpiod); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(regulator_register); /** * regulator_unregister - unregister regulator * @rdev: regulator to unregister * * Called by regulator drivers to unregister a regulator. */ void regulator_unregister(struct regulator_dev *rdev) { if (rdev == NULL) return; if (rdev->supply) { while (rdev->use_count--) regulator_disable(rdev->supply); regulator_put(rdev->supply); } flush_work(&rdev->disable_work.work); mutex_lock(&regulator_list_mutex); WARN_ON(rdev->open_count); regulator_remove_coupling(rdev); unset_regulator_supplies(rdev); list_del(&rdev->list); regulator_ena_gpio_free(rdev); device_unregister(&rdev->dev); mutex_unlock(&regulator_list_mutex); } EXPORT_SYMBOL_GPL(regulator_unregister); #ifdef CONFIG_SUSPEND /** * regulator_suspend - prepare regulators for system wide suspend * @dev: ``&struct device`` pointer that is passed to _regulator_suspend() * * Configure each regulator with its suspend operating parameters for the * target state. * * Return: 0 on success or a negative error number on failure. */ static int regulator_suspend(struct device *dev) { struct regulator_dev *rdev = dev_to_rdev(dev); suspend_state_t state = pm_suspend_target_state; int ret; const struct regulator_state *rstate; rstate = regulator_get_suspend_state_check(rdev, state); if (!rstate) return 0; regulator_lock(rdev); ret = __suspend_set_state(rdev, rstate); regulator_unlock(rdev); return ret; } static int regulator_resume(struct device *dev) { suspend_state_t state = pm_suspend_target_state; struct regulator_dev *rdev = dev_to_rdev(dev); struct regulator_state *rstate; int ret = 0; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return 0; /* Avoid grabbing the lock if we don't need to */ if (!rdev->desc->ops->resume) return 0; regulator_lock(rdev); if (rstate->enabled == ENABLE_IN_SUSPEND || rstate->enabled == DISABLE_IN_SUSPEND) ret = rdev->desc->ops->resume(rdev); regulator_unlock(rdev); return ret; } #else /* !CONFIG_SUSPEND */ #define regulator_suspend NULL #define regulator_resume NULL #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM static const struct dev_pm_ops __maybe_unused regulator_pm_ops = { .suspend = regulator_suspend, .resume = regulator_resume, }; #endif const struct class regulator_class = { .name = "regulator", .dev_release = regulator_dev_release, .dev_groups = regulator_dev_groups, #ifdef CONFIG_PM .pm = &regulator_pm_ops, #endif }; /** * regulator_has_full_constraints - the system has fully specified constraints * * Calling this function will cause the regulator API to disable all * regulators which have a zero use count and don't have an always_on * constraint in a late_initcall.
* * The intention is that this will become the default behaviour in a * future kernel release so users are encouraged to use this facility * now. */ void regulator_has_full_constraints(void) { has_full_constraints = 1; } EXPORT_SYMBOL_GPL(regulator_has_full_constraints); /** * rdev_get_drvdata - get rdev regulator driver data * @rdev: regulator * * Get rdev regulator driver private data. This call can be used in the * regulator driver context. * * Return: Pointer to regulator driver private data. */ void *rdev_get_drvdata(struct regulator_dev *rdev) { return rdev->reg_data; } EXPORT_SYMBOL_GPL(rdev_get_drvdata); /** * regulator_get_drvdata - get regulator driver data * @regulator: regulator * * Get regulator driver private data. This call can be used in the consumer * driver context when non-API, regulator-specific functions need to be called. * * Return: Pointer to regulator driver private data. */ void *regulator_get_drvdata(struct regulator *regulator) { return regulator->rdev->reg_data; } EXPORT_SYMBOL_GPL(regulator_get_drvdata); /** * regulator_set_drvdata - set regulator driver data * @regulator: regulator * @data: data */ void regulator_set_drvdata(struct regulator *regulator, void *data) { regulator->rdev->reg_data = data; } EXPORT_SYMBOL_GPL(regulator_set_drvdata); /** * rdev_get_id - get regulator ID * @rdev: regulator * * Return: Regulator ID for @rdev. */ int rdev_get_id(struct regulator_dev *rdev) { return rdev->desc->id; } EXPORT_SYMBOL_GPL(rdev_get_id); struct device *rdev_get_dev(struct regulator_dev *rdev) { return &rdev->dev; } EXPORT_SYMBOL_GPL(rdev_get_dev); struct regmap *rdev_get_regmap(struct regulator_dev *rdev) { return rdev->regmap; } EXPORT_SYMBOL_GPL(rdev_get_regmap); void *regulator_get_init_drvdata(struct regulator_init_data *reg_init_data) { return reg_init_data->driver_data; } EXPORT_SYMBOL_GPL(regulator_get_init_drvdata); #ifdef CONFIG_DEBUG_FS static int supply_map_show(struct seq_file *sf, void *data) { struct regulator_map *map; list_for_each_entry(map, &regulator_map_list, list) { seq_printf(sf, "%s -> %s.%s\n", rdev_get_name(map->regulator), map->dev_name, map->supply); } return 0; } DEFINE_SHOW_ATTRIBUTE(supply_map); struct summary_data { struct seq_file *s; struct regulator_dev *parent; int level; }; static void regulator_summary_show_subtree(struct seq_file *s, struct regulator_dev *rdev, int level); static int regulator_summary_show_children(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct summary_data *summary_data = data; if (rdev->supply && rdev->supply->rdev == summary_data->parent) regulator_summary_show_subtree(summary_data->s, rdev, summary_data->level + 1); return 0; } static void regulator_summary_show_subtree(struct seq_file *s, struct regulator_dev *rdev, int level) { struct regulation_constraints *c; struct regulator *consumer; struct summary_data summary_data; unsigned int opmode; if (!rdev) return; opmode = _regulator_get_mode_unlocked(rdev); seq_printf(s, "%*s%-*s %3d %4d %6d %7s ", level * 3 + 1, "", 30 - level * 3, rdev_get_name(rdev), rdev->use_count, rdev->open_count, rdev->bypass_count, regulator_opmode_to_str(opmode)); seq_printf(s, "%5dmV ", regulator_get_voltage_rdev(rdev) / 1000); seq_printf(s, "%5dmA ", _regulator_get_current_limit_unlocked(rdev) / 1000); c = rdev->constraints; if (c) { switch (rdev->desc->type) { case REGULATOR_VOLTAGE: seq_printf(s, "%5dmV %5dmV ", c->min_uV / 1000, c->max_uV / 1000); break; case REGULATOR_CURRENT: seq_printf(s, "%5dmA %5dmA ", c->min_uA / 1000,
c->max_uA / 1000); break; } } seq_puts(s, "\n"); list_for_each_entry(consumer, &rdev->consumer_list, list) { if (consumer->dev && consumer->dev->class == &regulator_class) continue; seq_printf(s, "%*s%-*s ", (level + 1) * 3 + 1, "", 30 - (level + 1) * 3, consumer->supply_name ? consumer->supply_name : consumer->dev ? dev_name(consumer->dev) : "deviceless"); switch (rdev->desc->type) { case REGULATOR_VOLTAGE: seq_printf(s, "%3d %33dmA%c%5dmV %5dmV", consumer->enable_count, consumer->uA_load / 1000, consumer->uA_load && !consumer->enable_count ? '*' : ' ', consumer->voltage[PM_SUSPEND_ON].min_uV / 1000, consumer->voltage[PM_SUSPEND_ON].max_uV / 1000); break; case REGULATOR_CURRENT: break; } seq_puts(s, "\n"); } summary_data.s = s; summary_data.level = level; summary_data.parent = rdev; class_for_each_device(&regulator_class, NULL, &summary_data, regulator_summary_show_children); } struct summary_lock_data { struct ww_acquire_ctx *ww_ctx; struct regulator_dev **new_contended_rdev; struct regulator_dev **old_contended_rdev; }; static int regulator_summary_lock_one(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct summary_lock_data *lock_data = data; int ret = 0; if (rdev != *lock_data->old_contended_rdev) { ret = regulator_lock_nested(rdev, lock_data->ww_ctx); if (ret == -EDEADLK) *lock_data->new_contended_rdev = rdev; else WARN_ON_ONCE(ret); } else { *lock_data->old_contended_rdev = NULL; } return ret; } static int regulator_summary_unlock_one(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct summary_lock_data *lock_data = data; if (lock_data) { if (rdev == *lock_data->new_contended_rdev) return -EDEADLK; } regulator_unlock(rdev); return 0; } static int regulator_summary_lock_all(struct ww_acquire_ctx *ww_ctx, struct regulator_dev **new_contended_rdev, struct regulator_dev **old_contended_rdev) { struct summary_lock_data lock_data; int ret; lock_data.ww_ctx = ww_ctx; lock_data.new_contended_rdev = new_contended_rdev; lock_data.old_contended_rdev = old_contended_rdev; ret = class_for_each_device(&regulator_class, NULL, &lock_data, regulator_summary_lock_one); if (ret) class_for_each_device(&regulator_class, NULL, &lock_data, regulator_summary_unlock_one); return ret; } static void regulator_summary_lock(struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *new_contended_rdev = NULL; struct regulator_dev *old_contended_rdev = NULL; int err; mutex_lock(&regulator_list_mutex); ww_acquire_init(ww_ctx, &regulator_ww_class); do { if (new_contended_rdev) { ww_mutex_lock_slow(&new_contended_rdev->mutex, ww_ctx); old_contended_rdev = new_contended_rdev; old_contended_rdev->ref_cnt++; old_contended_rdev->mutex_owner = current; } err = regulator_summary_lock_all(ww_ctx, &new_contended_rdev, &old_contended_rdev); if (old_contended_rdev) regulator_unlock(old_contended_rdev); } while (err == -EDEADLK); ww_acquire_done(ww_ctx); } static void regulator_summary_unlock(struct ww_acquire_ctx *ww_ctx) { class_for_each_device(&regulator_class, NULL, NULL, regulator_summary_unlock_one); ww_acquire_fini(ww_ctx); mutex_unlock(&regulator_list_mutex); } static int regulator_summary_show_roots(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct seq_file *s = data; if (!rdev->supply) regulator_summary_show_subtree(s, rdev, 0); return 0; } static int regulator_summary_show(struct seq_file *s, void *data) { struct ww_acquire_ctx ww_ctx; seq_puts(s, " regulator use open bypass opmode voltage current min max\n"); seq_puts(s,
"---------------------------------------------------------------------------------------\n"); regulator_summary_lock(&ww_ctx); class_for_each_device(®ulator_class, NULL, s, regulator_summary_show_roots); regulator_summary_unlock(&ww_ctx); return 0; } DEFINE_SHOW_ATTRIBUTE(regulator_summary); #endif /* CONFIG_DEBUG_FS */ static int __init regulator_init(void) { int ret; ret = class_register(®ulator_class); debugfs_root = debugfs_create_dir("regulator", NULL); if (IS_ERR(debugfs_root)) pr_debug("regulator: Failed to create debugfs directory\n"); #ifdef CONFIG_DEBUG_FS debugfs_create_file("supply_map", 0444, debugfs_root, NULL, &supply_map_fops); debugfs_create_file("regulator_summary", 0444, debugfs_root, NULL, ®ulator_summary_fops); #endif regulator_dummy_init(); regulator_coupler_register(&generic_regulator_coupler); return ret; } /* init early to allow our consumers to complete system booting */ core_initcall(regulator_init); static int regulator_late_cleanup(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct regulation_constraints *c = rdev->constraints; int ret; if (c && c->always_on) return 0; if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) return 0; regulator_lock(rdev); if (rdev->use_count) goto unlock; /* If reading the status failed, assume that it's off. */ if (_regulator_is_enabled(rdev) <= 0) goto unlock; if (have_full_constraints()) { /* We log since this may kill the system if it goes * wrong. */ rdev_info(rdev, "disabling\n"); ret = _regulator_do_disable(rdev); if (ret != 0) rdev_err(rdev, "couldn't disable: %pe\n", ERR_PTR(ret)); } else { /* The intention is that in future we will * assume that full constraints are provided * so warn even if we aren't going to do * anything here. */ rdev_warn(rdev, "incomplete constraints, leaving on\n"); } unlock: regulator_unlock(rdev); return 0; } static bool regulator_ignore_unused; static int __init regulator_ignore_unused_setup(char *__unused) { regulator_ignore_unused = true; return 1; } __setup("regulator_ignore_unused", regulator_ignore_unused_setup); static void regulator_init_complete_work_function(struct work_struct *work) { /* * Regulators may had failed to resolve their input supplies * when were registered, either because the input supply was * not registered yet or because its parent device was not * bound yet. So attempt to resolve the input supplies for * pending regulators before trying to disable unused ones. */ class_for_each_device(®ulator_class, NULL, NULL, regulator_register_resolve_supply); /* * For debugging purposes, it may be useful to prevent unused * regulators from being disabled. */ if (regulator_ignore_unused) { pr_warn("regulator: Not disabling unused regulators\n"); return; } /* If we have a full configuration then disable any regulators * we have permission to change the status for and which are * not in use or always_on. This is effectively the default * for DT and ACPI as they have full constraints. */ class_for_each_device(®ulator_class, NULL, NULL, regulator_late_cleanup); } static DECLARE_DELAYED_WORK(regulator_init_complete_work, regulator_init_complete_work_function); static int __init regulator_init_complete(void) { /* * Since DT doesn't provide an idiomatic mechanism for * enabling full constraints and since it's much more natural * with DT to provide them just assume that a DT enabled * system has full constraints. 
*/ if (of_have_populated_dt()) has_full_constraints = true; /* * We punt completion for an arbitrary amount of time since * systems like distros will load many drivers from userspace * so consumers might not always be ready yet; this is * particularly an issue with laptops where this might bounce * the display off then on. Ideally we'd get a notification * from userspace when this happens but we don't so just wait * a bit and hope we waited long enough. It'd be better if * we'd only do this on systems that need it, and a kernel * command line option might be useful. */ schedule_delayed_work(&regulator_init_complete_work, msecs_to_jiffies(30000)); return 0; } late_initcall_sync(regulator_init_complete);
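The bulk consumer API implemented above is easiest to see end to end from a driver's point of view. The sketch below is illustrative only and is not part of core.c; the device pointer, the supply names "vdd"/"vddio" and the demo_* helpers are assumptions made for the example.

/* Hypothetical consumer-side usage of the bulk regulator API (sketch). */
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/regulator/consumer.h>

static struct regulator_bulk_data demo_supplies[] = {
	{ .supply = "vdd" },	/* supply names assumed for illustration */
	{ .supply = "vddio" },
};

static int demo_power_on(struct device *dev)
{
	int ret;

	/* Acquire all supplies in one call; on failure nothing is left held. */
	ret = regulator_bulk_get(dev, ARRAY_SIZE(demo_supplies), demo_supplies);
	if (ret)
		return ret;

	/* Enable them; a failed enable rolls back the ones that succeeded. */
	ret = regulator_bulk_enable(ARRAY_SIZE(demo_supplies), demo_supplies);
	if (ret)
		regulator_bulk_free(ARRAY_SIZE(demo_supplies), demo_supplies);

	return ret;
}

static void demo_power_off(void)
{
	/* Disable in reverse order, then drop the references. */
	regulator_bulk_disable(ARRAY_SIZE(demo_supplies), demo_supplies);
	regulator_bulk_free(ARRAY_SIZE(demo_supplies), demo_supplies);
}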
/* * Copyright (c) 2016 Laurent Pinchart <laurent.pinchart@ideasonboard.com> * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_FOURCC_H__ #define __DRM_FOURCC_H__ #include <linux/math.h> #include <linux/types.h> #include <uapi/drm/drm_fourcc.h> /** * DRM_FORMAT_MAX_PLANES - maximum number of planes a DRM format can have */ #define DRM_FORMAT_MAX_PLANES 4u /* * DRM formats are little endian. Define host endian variants for the * most common formats here, to reduce the #ifdefs needed in drivers. * * Note that the DRM_FORMAT_BIG_ENDIAN flag should only be used in * case the format can't be specified otherwise, so we don't end up * with two values describing the same format.
*/ #ifdef __BIG_ENDIAN # define DRM_FORMAT_HOST_XRGB1555 (DRM_FORMAT_XRGB1555 | \ DRM_FORMAT_BIG_ENDIAN) # define DRM_FORMAT_HOST_RGB565 (DRM_FORMAT_RGB565 | \ DRM_FORMAT_BIG_ENDIAN) # define DRM_FORMAT_HOST_XRGB8888 DRM_FORMAT_BGRX8888 # define DRM_FORMAT_HOST_ARGB8888 DRM_FORMAT_BGRA8888 #else # define DRM_FORMAT_HOST_XRGB1555 DRM_FORMAT_XRGB1555 # define DRM_FORMAT_HOST_RGB565 DRM_FORMAT_RGB565 # define DRM_FORMAT_HOST_XRGB8888 DRM_FORMAT_XRGB8888 # define DRM_FORMAT_HOST_ARGB8888 DRM_FORMAT_ARGB8888 #endif struct drm_device; /** * struct drm_format_info - information about a DRM format */ struct drm_format_info { /** @format: 4CC format identifier (DRM_FORMAT_*) */ u32 format; /** * @depth: * * Color depth (number of bits per pixel excluding padding bits), * valid for a subset of RGB formats only. This is a legacy field, do * not use in new code and set to 0 for new formats. */ u8 depth; /** @num_planes: Number of color planes (1 to 3) */ u8 num_planes; union { /** * @cpp: * * Number of bytes per pixel (per plane), this is aliased with * @char_per_block. It is deprecated in favour of using the * triplet @char_per_block, @block_w, @block_h for better * describing the pixel format. */ u8 cpp[DRM_FORMAT_MAX_PLANES]; /** * @char_per_block: * * Number of bytes per block (per plane), where blocks are * defined as a rectangle of pixels which are stored next to * each other in a byte aligned memory region. Together with * @block_w and @block_h this is used to properly describe tiles * in tiled formats or to describe groups of pixels in packed * formats for which the memory needed for a single pixel is not * byte aligned. * * @cpp has been kept for historical reasons because there are * a lot of places in drivers where it's used. In drm core for * generic code paths the preferred way is to use * @char_per_block, drm_format_info_block_width() and * drm_format_info_block_height() which allows handling both * block and non-block formats in the same way. * * For formats that are intended to be used only with non-linear * modifiers both @cpp and @char_per_block must be 0 in the * generic format table. Drivers could supply accurate * information from their drm_mode_config.get_format_info hook * if they want the core to validate the pitch. */ u8 char_per_block[DRM_FORMAT_MAX_PLANES]; }; /** * @block_w: * * Block width in pixels, this is intended to be accessed through * drm_format_info_block_width() */ u8 block_w[DRM_FORMAT_MAX_PLANES]; /** * @block_h: * * Block height in pixels, this is intended to be accessed through * drm_format_info_block_height() */ u8 block_h[DRM_FORMAT_MAX_PLANES]; /** @hsub: Horizontal chroma subsampling factor */ u8 hsub; /** @vsub: Vertical chroma subsampling factor */ u8 vsub; /** @has_alpha: Does the format embed an alpha component? */ bool has_alpha; /** @is_yuv: Is it a YUV format? */ bool is_yuv; /** @is_color_indexed: Is it a color-indexed format? */ bool is_color_indexed; }; /** * drm_format_info_is_yuv_packed - check that the format info matches a YUV * format with data laid in a single plane * @info: format info * * Returns: * A boolean indicating whether the format info matches a packed YUV format.
*/ static inline bool drm_format_info_is_yuv_packed(const struct drm_format_info *info) { return info->is_yuv && info->num_planes == 1; } /** * drm_format_info_is_yuv_semiplanar - check that the format info matches a YUV * format with data laid in two planes (luminance and chrominance) * @info: format info * * Returns: * A boolean indicating whether the format info matches a semiplanar YUV format. */ static inline bool drm_format_info_is_yuv_semiplanar(const struct drm_format_info *info) { return info->is_yuv && info->num_planes == 2; } /** * drm_format_info_is_yuv_planar - check that the format info matches a YUV * format with data laid in three planes (one for each YUV component) * @info: format info * * Returns: * A boolean indicating whether the format info matches a planar YUV format. */ static inline bool drm_format_info_is_yuv_planar(const struct drm_format_info *info) { return info->is_yuv && info->num_planes == 3; } /** * drm_format_info_is_yuv_sampling_410 - check that the format info matches a * YUV format with 4:1:0 sub-sampling * @info: format info * * Returns: * A boolean indicating whether the format info matches a YUV format with 4:1:0 * sub-sampling. */ static inline bool drm_format_info_is_yuv_sampling_410(const struct drm_format_info *info) { return info->is_yuv && info->hsub == 4 && info->vsub == 4; } /** * drm_format_info_is_yuv_sampling_411 - check that the format info matches a * YUV format with 4:1:1 sub-sampling * @info: format info * * Returns: * A boolean indicating whether the format info matches a YUV format with 4:1:1 * sub-sampling. */ static inline bool drm_format_info_is_yuv_sampling_411(const struct drm_format_info *info) { return info->is_yuv && info->hsub == 4 && info->vsub == 1; } /** * drm_format_info_is_yuv_sampling_420 - check that the format info matches a * YUV format with 4:2:0 sub-sampling * @info: format info * * Returns: * A boolean indicating whether the format info matches a YUV format with 4:2:0 * sub-sampling. */ static inline bool drm_format_info_is_yuv_sampling_420(const struct drm_format_info *info) { return info->is_yuv && info->hsub == 2 && info->vsub == 2; } /** * drm_format_info_is_yuv_sampling_422 - check that the format info matches a * YUV format with 4:2:2 sub-sampling * @info: format info * * Returns: * A boolean indicating whether the format info matches a YUV format with 4:2:2 * sub-sampling. */ static inline bool drm_format_info_is_yuv_sampling_422(const struct drm_format_info *info) { return info->is_yuv && info->hsub == 2 && info->vsub == 1; } /** * drm_format_info_is_yuv_sampling_444 - check that the format info matches a * YUV format with 4:4:4 sub-sampling * @info: format info * * Returns: * A boolean indicating whether the format info matches a YUV format with 4:4:4 * sub-sampling. */ static inline bool drm_format_info_is_yuv_sampling_444(const struct drm_format_info *info) { return info->is_yuv && info->hsub == 1 && info->vsub == 1; } /** * drm_format_info_plane_width - width of the plane given the first plane * @info: pixel format info * @width: width of the first plane * @plane: plane index * * Returns: * The width of @plane, given that the width of the first plane is @width. 
*/ static inline int drm_format_info_plane_width(const struct drm_format_info *info, int width, int plane) { if (!info || plane >= info->num_planes) return 0; if (plane == 0) return width; return DIV_ROUND_UP(width, info->hsub); } /** * drm_format_info_plane_height - height of the plane given the first plane * @info: pixel format info * @height: height of the first plane * @plane: plane index * * Returns: * The height of @plane, given that the height of the first plane is @height. */ static inline int drm_format_info_plane_height(const struct drm_format_info *info, int height, int plane) { if (!info || plane >= info->num_planes) return 0; if (plane == 0) return height; return DIV_ROUND_UP(height, info->vsub); } const struct drm_format_info *__drm_format_info(u32 format); const struct drm_format_info *drm_format_info(u32 format); const struct drm_format_info * drm_get_format_info(struct drm_device *dev, u32 pixel_format, u64 modifier); uint32_t drm_mode_legacy_fb_format(uint32_t bpp, uint32_t depth); uint32_t drm_driver_legacy_fb_format(struct drm_device *dev, uint32_t bpp, uint32_t depth); uint32_t drm_driver_color_mode_format(struct drm_device *dev, unsigned int color_mode); unsigned int drm_format_info_block_width(const struct drm_format_info *info, int plane); unsigned int drm_format_info_block_height(const struct drm_format_info *info, int plane); unsigned int drm_format_info_bpp(const struct drm_format_info *info, int plane); uint64_t drm_format_info_min_pitch(const struct drm_format_info *info, int plane, unsigned int buffer_width); #endif /* __DRM_FOURCC_H__ */
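To make the plane helpers above concrete, here is a small illustrative sketch (not part of the header): it looks up the format info for NV12 and derives each plane's dimensions and minimum pitch. The 1920x1080 resolution and the demo_* name are assumptions for the example.

/* Hypothetical usage of the drm_format_info helpers (sketch). */
#include <linux/printk.h>
#include <drm/drm_fourcc.h>

static void demo_nv12_layout(void)
{
	const struct drm_format_info *info = drm_format_info(DRM_FORMAT_NV12);
	unsigned int width = 1920, height = 1080;	/* assumed buffer size */
	int plane;

	if (!info)
		return;

	/* NV12: plane 0 is full-resolution Y, plane 1 is 2x2-subsampled CbCr. */
	for (plane = 0; plane < info->num_planes; plane++) {
		int w = drm_format_info_plane_width(info, width, plane);
		int h = drm_format_info_plane_height(info, height, plane);
		uint64_t pitch = drm_format_info_min_pitch(info, plane, w);

		pr_info("plane %d: %dx%d, min pitch %llu bytes\n",
			plane, w, h, (unsigned long long)pitch);
	}
}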
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_U64_STATS_SYNC_H #define _LINUX_U64_STATS_SYNC_H /* * Protect against 64-bit values tearing on 32-bit architectures. This is * typically used for statistics read/update in different subsystems. * * Key points: * * - Use a seqcount on 32-bit * - The whole thing is a no-op on 64-bit architectures. * * Usage constraints: * * 1) Write side must ensure mutual exclusion, or one seqcount update could * be lost, thus blocking readers forever. * * 2) Write side must disable preemption, or a seqcount reader can preempt the * writer and also spin forever. * * 3) Write side must use the _irqsave() variant if other writers, or a reader, * can be invoked from an IRQ context. On 64bit systems this variant does not * disable interrupts. * * 4) If reader fetches several counters, there is no guarantee the whole values * are consistent w.r.t. each other (remember point #2: seqcounts are not * used for 64bit architectures). * * 5) Readers are allowed to sleep or be preempted/interrupted: they perform * pure reads. * * Usage: * * A stats producer (writer) should use the following template, provided it * already has exclusive access to the counters (a lock is already taken, or * per-CPU data is used [in a non-preemptible context]): * * spin_lock_bh(...) or other synchronization to get exclusive access * ... * u64_stats_update_begin(&stats->syncp); * u64_stats_add(&stats->bytes64, len); // non atomic operation * u64_stats_inc(&stats->packets64); // non atomic operation * u64_stats_update_end(&stats->syncp); * * A consumer (reader) should use the following template to get a consistent * snapshot of each variable (but with no consistency guarantee across several * of them): * * u64 tbytes, tpackets; * unsigned int start; * * do { * start = u64_stats_fetch_begin(&stats->syncp); * tbytes = u64_stats_read(&stats->bytes64); // non atomic operation * tpackets = u64_stats_read(&stats->packets64); // non atomic operation * } while (u64_stats_fetch_retry(&stats->syncp, start)); * * * Example of use in drivers/net/loopback.c, using per_cpu containers, * in BH disabled context.
*/ #include <linux/seqlock.h> struct u64_stats_sync { #if BITS_PER_LONG == 32 seqcount_t seq; #endif }; #if BITS_PER_LONG == 64 #include <asm/local64.h> typedef struct { local64_t v; } u64_stats_t; static inline u64 u64_stats_read(const u64_stats_t *p) { return local64_read(&p->v); } static inline void u64_stats_set(u64_stats_t *p, u64 val) { local64_set(&p->v, val); } static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { local64_add(val, &p->v); } static inline void u64_stats_inc(u64_stats_t *p) { local64_inc(&p->v); } static inline void u64_stats_init(struct u64_stats_sync *syncp) { } static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { } static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { } static inline unsigned long __u64_stats_irqsave(void) { return 0; } static inline void __u64_stats_irqrestore(unsigned long flags) { } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { return 0; } static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { return false; } #else /* 64 bit */ typedef struct { u64 v; } u64_stats_t; static inline u64 u64_stats_read(const u64_stats_t *p) { return p->v; } static inline void u64_stats_set(u64_stats_t *p, u64 val) { p->v = val; } static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { p->v += val; } static inline void u64_stats_inc(u64_stats_t *p) { p->v++; } #define u64_stats_init(syncp) \ do { \ struct u64_stats_sync *__s = (syncp); \ seqcount_init(&__s->seq); \ } while (0) static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { preempt_disable_nested(); write_seqcount_begin(&syncp->seq); } static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { write_seqcount_end(&syncp->seq); preempt_enable_nested(); } static inline unsigned long __u64_stats_irqsave(void) { unsigned long flags; local_irq_save(flags); return flags; } static inline void __u64_stats_irqrestore(unsigned long flags) { local_irq_restore(flags); } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { return read_seqcount_begin(&syncp->seq); } static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { return read_seqcount_retry(&syncp->seq, start); } #endif /* !64 bit */ static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { __u64_stats_update_begin(syncp); } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { __u64_stats_update_end(syncp); } static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = __u64_stats_irqsave(); __u64_stats_update_begin(syncp); return flags; } static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { __u64_stats_update_end(syncp); __u64_stats_irqrestore(flags); } static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { return __u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { return __u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */
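The writer/reader contract described in the header comment is compact enough to show in one self-contained sketch (illustrative, not part of the file; the demo_stats struct and function names are assumptions): the writer holds exclusive access and brackets its updates, while the reader retries until it observes a consistent snapshot.

/* Hypothetical usage of the u64_stats accessors (sketch). */
#include <linux/u64_stats_sync.h>

struct demo_stats {
	u64_stats_t		bytes;
	u64_stats_t		packets;
	struct u64_stats_sync	syncp;
};

/* One-time setup: initializes the seqcount on 32-bit, a no-op on 64-bit. */
static void demo_stats_init(struct demo_stats *s)
{
	u64_stats_init(&s->syncp);
}

/* Writer: caller already has exclusive access (lock held or per-CPU data). */
static void demo_stats_update(struct demo_stats *s, unsigned int len)
{
	u64_stats_update_begin(&s->syncp);
	u64_stats_add(&s->bytes, len);
	u64_stats_inc(&s->packets);
	u64_stats_update_end(&s->syncp);
}

/* Reader: loops until the seqcount (32-bit only) shows no concurrent write. */
static void demo_stats_fetch(struct demo_stats *s, u64 *bytes, u64 *packets)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin(&s->syncp);
		*bytes = u64_stats_read(&s->bytes);
		*packets = u64_stats_read(&s->packets);
	} while (u64_stats_fetch_retry(&s->syncp, start));
}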
// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ #include <linux/file.h> #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/miscdevice.h> #include <linux/node.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/vmalloc.h> #include <asm/msr.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" #include "encls.h" struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. */ static nodemask_t sgx_numa_mask; /* * Array with one list_head for each possible NUMA node. Each * list contains all the sgx_epc_section's which are on that * node. */ static struct sgx_numa_node *sgx_numa_nodes; static LIST_HEAD(sgx_dirty_page_list); /* * Reset post-kexec EPC pages to the uninitialized state. The pages are removed * from the input list, and made available for the page allocator. SECS pages * prepending their children in the input list are left intact. * * Return 0 when sanitization was successful or kthread was stopped, and the * number of unsanitized pages otherwise. */ static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) { unsigned long left_dirty = 0; struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; /* dirty_page_list is thread-local, no need for a lock: */ while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return 0; page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); /* * Checking page->poison without holding the node->lock * is racy, but losing the race (i.e. poison is set just * after the check) just means __eremove() will be uselessly * called for a page that sgx_free_epc_page() will put onto * the node->sgx_poison_page_list later. */ if (page->poison) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; struct sgx_numa_node *node = section->node; spin_lock(&node->lock); list_move(&page->list, &node->sgx_poison_page_list); spin_unlock(&node->lock); continue; } ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* * page is now sanitized. Make it available via the SGX * page allocator: */ list_del(&page->list); sgx_free_epc_page(page); } else { /* The page is not yet clean - move to the dirty list.
*/ list_move_tail(&page->list, &dirty); left_dirty++; } cond_resched(); } list_splice(&dirty, dirty_page_list); return left_dirty; } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) { struct sgx_encl_page *page = epc_page->owner; struct sgx_encl *encl = page->encl; struct sgx_encl_mm *encl_mm; bool ret = true; int idx; idx = srcu_read_lock(&encl->srcu); list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { if (!mmget_not_zero(encl_mm->mm)) continue; mmap_read_lock(encl_mm->mm); ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); mmap_read_unlock(encl_mm->mm); mmput_async(encl_mm->mm); if (!ret) break; } srcu_read_unlock(&encl->srcu, idx); if (!ret) return false; return true; } static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) { struct sgx_encl_page *page = epc_page->owner; unsigned long addr = page->desc & PAGE_MASK; struct sgx_encl *encl = page->encl; int ret; sgx_zap_enclave_ptes(encl, addr); mutex_lock(&encl->lock); ret = __eblock(sgx_get_epc_virt_addr(epc_page)); if (encls_failed(ret)) ENCLS_WARN(ret, "EBLOCK"); mutex_unlock(&encl->lock); } static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, struct sgx_backing *backing) { struct sgx_pageinfo pginfo; int ret; pginfo.addr = 0; pginfo.secs = 0; pginfo.contents = (unsigned long)kmap_local_page(backing->contents); pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) + backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); set_page_dirty(backing->pcmd); set_page_dirty(backing->contents); kunmap_local((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); kunmap_local((void *)(unsigned long)pginfo.contents); return ret; } void sgx_ipi_cb(void *info) { } /* * Swap page to the regular memory transformed to the blocked state by using * EBLOCK, which means that it can no longer be referenced (no new TLB entries). * * The first trial just tries to write the page assuming that some other thread * has reset the count for threads inside the enclave by using ETRACK, and * previous thread count has been zeroed out. The second trial calls ETRACK * before EWB. If that fails we kick all the HW threads out, and then do EWB, * which should be guaranteed to succeed. */ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, struct sgx_backing *backing) { struct sgx_encl_page *encl_page = epc_page->owner; struct sgx_encl *encl = encl_page->encl; struct sgx_va_page *va_page; unsigned int va_offset; void *va_slot; int ret; encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); va_offset = sgx_alloc_va_slot(va_page); va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; if (sgx_va_page_full(va_page)) list_move_tail(&va_page->list, &encl->va_pages); ret = __sgx_encl_ewb(epc_page, va_slot, backing); if (ret == SGX_NOT_TRACKED) { ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); if (ret) { if (encls_failed(ret)) ENCLS_WARN(ret, "ETRACK"); } ret = __sgx_encl_ewb(epc_page, va_slot, backing); if (ret == SGX_NOT_TRACKED) { /* * Slow path, send IPIs to kick cpus out of the * enclave. Note, it's imperative that the cpu * mask is generated *after* ETRACK, else we'll * miss cpus that entered the enclave between * generating the mask and incrementing epoch.
			on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);

			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					     &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip pages that have
 * been accessed since the last scan and move them to the tail of the active
 * page pool so that pages get scanned in LRU-like fashion.
 *
 * Batch-process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() does spread
 * the cost a bit among the HW threads with its three-stage EWB pipeline (EWB,
 * ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a
 * time would also be problematic as it would increase the lock contention too
 * much, which would halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/*
			 * The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}

static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}

/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}

static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
 * from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	a free EPC page was available
 * - NULL:		out of EPC pages
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid_start, nid;

	/*
	 * Try local node first. If it doesn't have an EPC section,
	 * fall back to the non-local NUMA nodes.
	 */
	if (node_isset(nid_of_current, sgx_numa_mask))
		nid_start = nid_of_current;
	else
		nid_start = next_node_in(nid_of_current, sgx_numa_mask);

	nid = nid_start;
	do {
		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;

		nid = next_node_in(nid, sgx_numa_mask);
	} while (nid != nid_start);

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through EPC sections and hand a free EPC page to the caller. When a
 * page is no longer needed it must be released with sgx_free_epc_page(). If
 * @reclaim is set to true, directly reclaim pages when we are out of pages. No
 * mm's can be locked when @reclaim is set to true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}
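/*
 * Editor's illustrative sketch (not part of the original file): a minimal,
 * hypothetical helper showing the intended call pattern for the allocator
 * above - allocate with @reclaim permitted, then opt the page into the
 * reclaimer with sgx_mark_page_reclaimable(). The helper name is invented
 * for illustration.
 */
static struct sgx_epc_page *sgx_example_alloc_tracked(void *owner)
{
	struct sgx_epc_page *epc_page;

	/* May sleep and reclaim; the caller must not hold any mm locks. */
	epc_page = sgx_alloc_epc_page(owner, true);
	if (IS_ERR(epc_page))
		return epc_page;

	/* Allow ksgxd to evict this page when EPC runs low. */
	sgx_mark_page_reclaimable(epc_page);
	return epc_page;
}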
/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back to the list of free pages. It's the caller's
 * responsibility to make sure that the page is in uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
	atomic_long_inc(&sgx_nr_free_pages);
}

static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}

bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If poison was consumed synchronously, send a SIGBUS to
	 * the task. Hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	sgx_unmark_page_reclaimable(page);

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}

/*
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}
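/*
 * Editor's worked example (illustrative values, not from the original file):
 * with low = 0x80001000 and high = 0x00000002,
 * (low & GENMASK_ULL(31, 12)) = 0x80001000 and
 * ((high & GENMASK_ULL(19, 0)) << 32) = 0x200000000,
 * so sgx_calc_section_metric() returns 0x280001000.
 */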
#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	/* Make all x86/ attributes invisible when SGX is not initialized: */
	if (nodes_empty(sgx_numa_mask))
		return 0;

	return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
	&dev_attr_sgx_total_bytes.attr,
	NULL,
};

const struct attribute_group arch_node_dev_group = {
	.name = "x86",
	.attrs = arch_node_dev_attrs,
	.is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
	struct node *node = node_devices[nid];
	int ret;

	ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);

	if (ret)
		pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif

static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa   = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
			sgx_numa_nodes[nid].size = 0;

			/* Make SGX-specific node sysfs files visible: */
			arch_update_sysfs_visibility(nid);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
		sgx_numa_nodes[nid].size += size;

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	for_each_online_node(nid) {
		if (!node_isset(nid, sgx_numa_mask) &&
		    node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
			pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
				nid);
	}

	return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver must update them to the hash of the enclave's signer
 * before EINIT. KVM must update them to the guest's virtual MSR values
 * before doing EINIT on behalf of the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}

const struct file_operations sgx_provision_fops = {
	.owner			= THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given file descriptor
 * @allowed_attributes:		Pointer to allowed enclave attributes
 * @attribute_fd:		File descriptor for specific attribute
 *
 * Append enclave attribute indicated by file descriptor to allowed
 * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
 * /dev/sgx_provision is supported.
 *
 * Return:
 * 0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 * -EINVAL:	Invalid or unsupported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	CLASS(fd, f)(attribute_fd);

	if (fd_empty(f))
		return -EINVAL;

	if (fd_file(f)->f_op != &sgx_provision_fops)
		return -EINVAL;

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;

	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);

static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);
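/*
 * Editor's illustrative sketch (not part of the original file): how a
 * hypothetical caller of sgx_set_attribute() might widen an enclave's
 * allowed attribute mask with a userspace-supplied fd. Names other than
 * sgx_set_attribute() and SGX_ATTR_PROVISIONKEY are invented.
 */
static int sgx_example_grant_provision(unsigned long *allowed_attributes,
				       unsigned int attribute_fd)
{
	int ret;

	/* Fails with -EINVAL unless @attribute_fd refers to /dev/sgx_provision. */
	ret = sgx_set_attribute(allowed_attributes, attribute_fd);
	if (ret)
		return ret;

	WARN_ON(!(*allowed_attributes & SGX_ATTR_PROVISIONKEY));
	return 0;
}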
// SPDX-License-Identifier: GPL-2.0
/*
 * Parts of this file are
 * Copyright (C) 2022-2023 Intel Corporation
 */
#include <linux/ieee80211.h>
#include <linux/export.h>
#include <net/cfg80211.h>
#include "nl80211.h"
#include "core.h"
#include "rdev-ops.h"

static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
			       struct net_device *dev, unsigned int link_id,
			       bool notify)
{
	struct wireless_dev *wdev = dev->ieee80211_ptr;
	int err;

	lockdep_assert_wiphy(wdev->wiphy);

	if (!rdev->ops->stop_ap)
		return -EOPNOTSUPP;

	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
		return -EOPNOTSUPP;

	if (!wdev->links[link_id].ap.beacon_interval)
		return -ENOENT;

	err = rdev_stop_ap(rdev, dev, link_id);
	if (!err) {
		wdev->conn_owner_nlportid = 0;
		wdev->links[link_id].ap.beacon_interval = 0;
		memset(&wdev->links[link_id].ap.chandef, 0,
		       sizeof(wdev->links[link_id].ap.chandef));
		wdev->u.ap.ssid_len = 0;
		rdev_set_qos_map(rdev, dev, NULL);
		if (notify)
			nl80211_send_ap_stopped(wdev, link_id);

		/* Should we apply the grace period during beaconing interface
		 * shutdown also?
		 */
		cfg80211_sched_dfs_chan_update(rdev);
	}

	schedule_work(&cfg80211_disconnect_work);

	return err;
}

int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
		     struct net_device *dev, int link_id,
		     bool notify)
{
	unsigned int link;
	int ret = 0;

	if (link_id >= 0)
		return ___cfg80211_stop_ap(rdev, dev, link_id, notify);

	for_each_valid_link(dev->ieee80211_ptr, link) {
		int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify);

		if (ret1)
			ret = ret1;
		/* try the next one also if one errored */
	}

	return ret;
}
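/*
 * Editor's illustrative sketch (not part of the original file): a caller
 * that tears down every AP link on an MLD interface by passing a negative
 * link_id, which makes cfg80211_stop_ap() above iterate all valid links.
 * The helper name is invented for illustration.
 */
static int cfg80211_example_stop_all_links(struct cfg80211_registered_device *rdev,
					   struct net_device *dev)
{
	/* link_id < 0 means "all links"; notify userspace about each stop. */
	return cfg80211_stop_ap(rdev, dev, -1, true);
}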
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H

#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/init.h>
#include <linux/tick.h>

enum hk_type {
	HK_TYPE_DOMAIN,
	HK_TYPE_MANAGED_IRQ,
	HK_TYPE_KERNEL_NOISE,
	HK_TYPE_MAX,

	/*
	 * The following housekeeping types are only set by the nohz_full
	 * boot commandline option. So they can share the same value.
	 */
	HK_TYPE_TICK	= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_TIMER	= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_RCU	= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_MISC	= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_WQ	= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_KTHREAD	= HK_TYPE_KERNEL_NOISE
};

#ifdef CONFIG_CPU_ISOLATION
DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
extern int housekeeping_any_cpu(enum hk_type type);
extern const struct cpumask *housekeeping_cpumask(enum hk_type type);
extern bool housekeeping_enabled(enum hk_type type);
extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
extern void __init housekeeping_init(void);
#else

static inline int housekeeping_any_cpu(enum hk_type type)
{
	return smp_processor_id();
}

static inline const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
	return cpu_possible_mask;
}

static inline bool housekeeping_enabled(enum hk_type type)
{
	return false;
}

static inline void housekeeping_affine(struct task_struct *t,
				       enum hk_type type) { }

static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
	return true;
}

static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */

static inline bool housekeeping_cpu(int cpu, enum hk_type type)
{
#ifdef CONFIG_CPU_ISOLATION
	if (static_branch_unlikely(&housekeeping_overridden))
		return housekeeping_test_cpu(cpu, type);
#endif
	return true;
}

static inline bool cpu_is_isolated(int cpu)
{
	return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
	       !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
	       cpuset_cpu_is_isolated(cpu);
}

#endif /* _LINUX_SCHED_ISOLATION_H */
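/*
 * Editor's illustrative sketch (not part of the original header): how a
 * hypothetical driver might keep its maintenance work off isolated CPUs
 * using the API above - bind a task to the housekeeping set and pick a
 * housekeeping CPU for per-CPU work. Both function names are invented.
 */
#include <linux/sched/isolation.h>

static void example_confine_to_housekeeping(struct task_struct *t)
{
	/* Restrict @t to CPUs that are allowed to take general kernel noise. */
	housekeeping_affine(t, HK_TYPE_KERNEL_NOISE);
}

static int example_pick_service_cpu(void)
{
	/* Any CPU from the housekeeping mask is a safe home for noisy work. */
	return housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
}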
// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 *
FIXME: remove all knowledge of the buffer layer from the core VM */ #include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> #include "swap.h" /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ /* * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * * ->i_rwsem * ->invalidate_lock (acquired by fs in truncate path) * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock * ->invalidate_lock (filemap_fault) * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { XA_STATE(xas, &mapping->i_pages, folio->index); long nr = 1; mapping_set_update(&xas, mapping); xas_set_order(&xas, folio->index, folio_order(folio)); nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } static void filemap_unaccount_folio(struct address_space *mapping, struct folio *folio) { long nr; VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's * a good bet that actually the page is unmapped * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } } /* hugetlb folios do not participate in page cache accounting. 
*/ if (folio_test_hugetlb(folio)) return; nr = folio_nr_pages(folio); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ void __filemap_remove_folio(struct folio *folio, void *shadow) { struct address_space *mapping = folio->mapping; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); page_cache_delete(mapping, folio, shadow); } void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); folio_put_refs(folio, folio_nr_pages(folio)); } /** * filemap_remove_folio - Remove folio from page cache. * @folio: The folio. * * This must be called only on folios that are locked and have been * verified to be in the page cache. It will never put the folio into * the free list because the caller has a reference on the page. */ void filemap_remove_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); } /* * page_cache_delete_batch - delete several folios from page cache * @mapping: the mapping to which folios belong * @fbatch: batch of folios to delete * * The function walks over mapping->i_pages and removes folios passed in * @fbatch from the mapping. The function expects @fbatch to be sorted * by page index and is optimised for it to be dense. * It tolerates holes in @fbatch (mapping entries at those indices are not * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); long total_pages = 0; int i = 0; struct folio *folio; mapping_set_update(&xas, mapping); xas_for_each(&xas, folio, ULONG_MAX) { if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our * pages locked so they are protected from being removed. 
* If we see a page whose index is higher than ours, it * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ if (folio != fbatch->folios[i]) { VM_BUG_ON_FOLIO(folio->index > fbatch->folios[i]->index, folio); continue; } WARN_ON_ONCE(!folio_test_locked(folio)); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies on it */ i++; xas_store(&xas, NULL); total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch) { int i; if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); } page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ if (test_bit(AS_ENOSPC, &mapping->flags) && test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_bit(AS_EIO, &mapping->flags) && test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_check_errors); static int filemap_check_and_keep_errors(struct address_space *mapping) { /* Check for outstanding write errors */ if (test_bit(AS_EIO, &mapping->flags)) return -EIO; if (test_bit(AS_ENOSPC, &mapping->flags)) return -ENOSPC; return 0; } /** * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @wbc: the writeback_control controlling the writeout * * Call writepages on the mapping using the provided wbc to control the * writeout. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_wbc(struct address_space *mapping, struct writeback_control *wbc) { int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(wbc, mapping->host); ret = do_writepages(mapping, wbc); wbc_detach_inode(wbc); return ret; } EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. * * Return: %0 on success, negative error code otherwise. 
*/ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; return filemap_fdatawrite_wbc(mapping, &wbc); } static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); } EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } EXPORT_SYMBOL(filemap_flush); /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. * * Return: %true if at least one page exists in the specified range, * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; rcu_read_lock(); for (;;) { folio = xas_find(&xas, max); if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to * release the RCU lock anyway. It is enough to know that * there was a page here recently. 
*/ break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct folio_batch fbatch; unsigned nr_folios; folio_batch_init(&fbatch); while (index <= end) { unsigned i; nr_folios = filemap_get_folios_tag(mapping, &index, end, PAGECACHE_TAG_WRITEBACK, &fbatch); if (!nr_folios) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * filemap_fdatawait_range - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space * in the given range and wait for all of them. Check error status of * the address space and return it. * * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait_range_keep_errors - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space in the * given range and wait for all of them. Unlike filemap_fdatawait_range(), * this function does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the address space that file * refers to, in the given range and wait for all of them. Check error * status of the address space vs. the file->f_wb_err cursor and return it. * * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space vs. the file->f_wb_err cursor. 
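 *
 * A minimal sketch (hypothetical caller, for illustration only) of an
 * fsync-like path waiting on the whole file:
 *
 *	err = file_fdatawait_range(file, 0, LLONG_MAX);
 *	if (err)
 *		return err;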
*/ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { struct address_space *mapping = file->f_mapping; __filemap_fdatawait_range(mapping, start_byte, end_byte); return file_check_and_advance_wb_err(file); } EXPORT_SYMBOL(file_fdatawait_range); /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. Unlike filemap_fdatawait(), this function * does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) * * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { return mapping->nrpages; } bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_dirty(folio) || folio_test_locked(folio) || folio_test_writeback(folio)) break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0, err2; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = filemap_check_errors(mapping); if (!err) err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } EXPORT_SYMBOL(__filemap_set_wb_err); /** * file_check_and_advance_wb_err - report wb error (if any) that was previously * and advance wb_err to current one * @file: struct file on which the error is being reported * * When userland calls fsync (or something like nfsd does the equivalent), we * want to report any writeback errors that occurred since the last fsync (or * since the file was opened if there haven't been any). * * Grab the wb_err from the mapping. 
If it matches what we have in the file, * then just quickly return 0. The file is all caught up. * * If it doesn't match, then take the mapping value, set the "seen" flag in * it and try to swap it into place. If it works, or another task beat us * to it with the new value, then update the f_wb_err and return the error * portion. The error at this point must be reported via proper channels * (a'la fsync, or NFS COMMIT operation, etc.). * * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. * * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { int err = 0; errseq_t old = READ_ONCE(file->f_wb_err); struct address_space *mapping = file->f_mapping; /* Locklessly handle the common case where nothing has changed */ if (errseq_check(&mapping->wb_err, old)) { /* Something changed, must use slow path */ spin_lock(&file->f_lock); old = file->f_wb_err; err = errseq_check_and_advance(&mapping->wb_err, &file->f_wb_err); trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } /* * We're mostly using this function as a drop in replacement for * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect * that the legacy code would have had on these flags. */ clear_bit(AS_EIO, &mapping->flags); clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); /** * file_write_and_wait_range - write out & wait on a file range * @file: file pointing to address_space with pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. * * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { int err = 0, err2; struct address_space *mapping = file->f_mapping; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = file_check_and_advance_wb_err(file); if (!err) err = err2; return err; } EXPORT_SYMBOL(file_write_and_wait_range); /** * replace_page_cache_folio - replace a pagecache folio with a new one * @old: folio to be replaced * @new: folio to replace with * * This function replaces a folio in the pagecache with a new one. On * success it acquires the pagecache reference for the new folio and * drops it for the old folio. Both the old and new folios must be * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. 
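 *
 * An illustrative sketch (hypothetical caller, not taken from this file):
 *
 *	folio_lock(old);
 *	folio_lock(new);
 *	replace_page_cache_folio(old, new);
 *	folio_add_lru(new);
 *	folio_unlock(new);
 *	folio_unlock(old);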
*/ void replace_page_cache_folio(struct folio *old, struct folio *new) { struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); VM_BUG_ON_FOLIO(new->mapping, new); folio_get(new); new->mapping = mapping; new->index = offset; mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) __lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) __lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) __lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); folio_put(old); } EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = xas.xa_index; for (;;) { int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; if (!xa_is_value(entry)) { xas_set_err(&xas, -EEXIST); goto unlock; } /* * If a larger entry exists, * it will be the first and only entry iterated. */ if (order == -1) order = xas_get_order(&xas); } if (old) { if (order > 0 && order > forder) { unsigned int split_order = max(forder, xas_try_split_min_order(order)); /* How to handle large swap entries? 
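				 * Large swap entries are presumably only
				 * created for shmem/tmpfs mappings, which
				 * insert folios through their own path rather
				 * than this one, hence the BUG_ON below.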
*/ BUG_ON(shmem_mapping(mapping)); while (order > forder) { xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, order); if (xas_error(&xas)) goto unlock; order = split_order; split_order = max(xas_try_split_min_order( split_order), forder); } xas_reset(&xas); } if (shadowp) *shadowp = old; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ if (!huge) { __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) goto error; trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; ret = mem_cgroup_charge(folio, NULL, gfp); if (ret) return ret; __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) { mem_cgroup_uncharge(folio); __folio_clear_locked(folio); } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed folio. * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ WARN_ON_ONCE(folio_test_active(folio)); if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } return folio_alloc_noprof(gfp, order); } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* * filemap_invalidate_lock_two - lock invalidate_lock for two mappings * * Lock exclusively invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to lock * @mapping2: the second mapping to lock */ void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1 > mapping2) swap(mapping1, mapping2); if (mapping1) down_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) down_write_nested(&mapping2->invalidate_lock, 1); } EXPORT_SYMBOL(filemap_invalidate_lock_two); /* * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings * * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to unlock * @mapping2: the second mapping to unlock */ void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1) up_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) up_write(&mapping2->invalidate_lock); } EXPORT_SYMBOL(filemap_invalidate_unlock_two); /* * In order to wait for pages to become available there must be * waitqueues associated with pages. 
By using a hash table of * waitqueues where the bucket discipline is to maintain all * waiters on the same queue and wake all when any of the pages * become available, and for the woken contexts to check to be * sure the appropriate page became available, this saves space * at a cost of "thundering herd" phenomena during rare hash * collisions. */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; static wait_queue_head_t *folio_waitqueue(struct folio *folio) { return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } /* How many times do we accept lock stealing from under a waiter? */ static int sysctl_page_lock_unfairness = 5; static const struct ctl_table filemap_sysctl_table[] = { { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, .maxlen = sizeof(sysctl_page_lock_unfairness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, } }; void __init pagecache_init(void) { int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } /* * The page wait code treats the "wait->flags" somewhat unusually, because * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: * * (a) no special bits set: * * We're just waiting for the bit to be released, and when a waker * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, * and remove it from the wait queue. * * Simple and straightforward. * * (b) WQ_FLAG_EXCLUSIVE: * * The waiter is waiting to get the lock, and only one waiter should * be woken up to avoid any thundering herd behavior. We'll set the * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. * * This is the traditional exclusive wait. * * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock * cannot be taken, we stop walking the wait queue without waking * the waiter. * * This is the "fair lock handoff" case, and in addition to setting * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see * that it now has the lock. */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wait_page, key)) return 0; /* * If it's a lock handoff wait, we get the bit for it, and * stop walking (and do not wake it up) if we can't. */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } } /* * We are holding the wait-queue lock, but the waiter that * is waiting for this will be checking the flags without * any locking. * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. 
* * Note that this pairs with the "finish_wait()" in the * waiter, and has to be the absolute last thing we do. * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void folio_wake_bit(struct folio *folio, int bit_nr) { wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page * waiters, but the hashed waitqueue has waiters for other pages on it. * That's okay, it's a rare case. The next waker will clear it. * * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, * other), the flag may be cleared in the course of freeing the page; * but that is not required for correctness. */ if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } /* * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, * like folio_put_wait_locked() on PG_locked. */ }; /* * Attempt to check (or get) the folio flag, and mark us done * if successful. */ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: wait->flags = 0; if (behavior == EXCLUSIVE) { wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM; } /* * Do one last check whether we can get the * page bit synchronously. * * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the * page queue), and add ourselves to the wait * queue if we need to sleep. * * This part needs to be done under the queue * lock to avoid races. */ spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * From now on, all the logic will be based on * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to * see whether the page bit testing has already * been done by the wake function. * * We can drop our reference to the folio. 
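	 * (That only applies to the DROP behavior below; EXCLUSIVE and
	 * SHARED callers keep their folio reference across the wait.)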
*/ if (behavior == DROP) folio_put(folio); /* * Note that until the "finish_wait()", or until * we see the WQ_FLAG_WOKEN flag, we need to * be very careful with the 'wait->flags', because * we may race with a waker that sets them. */ for (;;) { unsigned int flags; set_current_state(state); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(state, current)) break; io_schedule(); continue; } /* If we were non-exclusive, we're done */ if (behavior != EXCLUSIVE) break; /* If the waker got the lock for us, we're done */ if (flags & WQ_FLAG_DONE) break; /* * Otherwise, if we're getting the lock, we need to * try to get it ourselves. * * And if that fails, we'll have to retry this all. */ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; break; } /* * If a signal happened, this 'finish_wait()' may remove the last * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } /* * NOTE! The wait->flags weren't stable until we've done the * 'finish_wait()', and we could have exited the loop above due * to a signal, and had a wakeup event happen after the signal * test but before the 'finish_wait()'. * * So only after the finish_wait() can we reliably determine * if we got woken up or not, so we can now figure out the final * return value based on that state without races. * * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive * waiter, but an exclusive one requires WQ_FLAG_DONE. */ if (behavior == EXCLUSIVE) return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } #ifdef CONFIG_MIGRATION /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. * * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = PG_locked; wait->flags = 0; spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, PG_locked, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. 
So the page is valid until the ptl is dropped. */ spin_unlock(ptl); for (;;) { unsigned int flags; set_current_state(TASK_UNINTERRUPTIBLE); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) break; io_schedule(); continue; } break; } finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } #endif void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit); int folio_wait_bit_killable(struct folio *folio, int bit_nr) { return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit_killable); /** * folio_put_wait_locked - Drop a reference and wait for it to be unlocked * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not * dereference @folio. * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** * folio_unlock - Unlock a locked folio. * @folio: The folio. * * Unlocks the folio and wakes up any thread sleeping on the page lock. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_unlock(struct folio *folio) { /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); /** * folio_end_read - End read on a folio. * @folio: The folio. * @success: True if all reads completed successfully. * * When all reads against a folio have completed, filesystems should * call this function to let the pagecache know that no more reads * are outstanding. This will unlock the folio and wake up any thread * sleeping on the lock. The folio will also be marked uptodate if all * reads succeeded. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_end_read(struct folio *folio, bool success) { unsigned long mask = 1 << PG_locked; /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; if (folio_xor_flags_has_waiters(folio, mask)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. * * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for * it. The folio reference held for PG_private_2 being set is released. * * This is, for example, used when a netfs folio is being written to a local * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. 
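 *
 * For illustration (hypothetical usage): a new write to the cache would
 * typically wait for any earlier one with folio_wait_private_2() before
 * marking the folio again, and its completion handler would then call
 * folio_end_private_2().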
*/ void folio_end_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); folio_wake_bit(folio, PG_private_2); folio_put(folio); } EXPORT_SYMBOL(folio_end_private_2); /** * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { while (folio_test_private_2(folio)) folio_wait_bit(folio, PG_private_2); } EXPORT_SYMBOL(folio_wait_private_2); /** * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is * received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; while (folio_test_private_2(folio)) { ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } EXPORT_SYMBOL(folio_wait_private_2_killable); static void filemap_end_dropbehind(struct folio *folio) { struct address_space *mapping = folio->mapping; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (!folio_test_clear_dropbehind(folio)) return; if (mapping) folio_unmap_invalidate(mapping, folio, 0); } /* * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. */ static void filemap_end_dropbehind_write(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; /* * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, * but can happen if normal writeback just happens to find dirty folios * that were created as part of uncached writeback, and that writeback * would otherwise not need non-IRQ handling. Just skip the * invalidation in that case. */ if (in_task() && folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing * to shuffle a folio marked for immediate reclaim is too mild * a gain to justify taking an atomic operation penalty at the * end of every folio writeback. */ if (folio_test_reclaim(folio)) { folio_clear_reclaim(folio); folio_rotate_reclaimable(folio); } /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and * reused before the folio_wake_bit(). */ folio_get(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); filemap_end_dropbehind_write(folio); acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. 
* @folio: The folio to lock */ void __folio_lock(struct folio *folio) { folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } EXPORT_SYMBOL(__folio_lock); int __folio_lock_killable(struct folio *folio) { return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); int ret; wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); folio_set_waiters(folio); ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. */ if (!ret) __remove_wait_queue(q, &wait->wait); else ret = -EIOCBQUEUED; spin_unlock_irq(&q->lock); return ret; } /* * Return values: * 0 - folio is locked. * non-zero - folio is not locked. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock/per-VMA lock is not * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; ret = __folio_lock_killable(folio); if (ret) { release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { __folio_lock(folio); } return 0; } /** * page_cache_next_miss() - Find the next gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the * gap with the lowest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 5, then subsequently a gap is * created at index 10, page_cache_next_miss covering both indices may * return 10 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); unsigned long nr = max_scan; while (nr--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; if (xas.xa_index == 0) return 0; } return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); /** * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [max(index - max_scan + 1, 0), index] for the * gap with the highest index. * * This function may be called under the rcu_read_lock. 
However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 10, then subsequently a gap is * created at index 5, page_cache_prev_miss() covering both indices may * return 5 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == ULONG_MAX) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); /* * Lockless page cache protocol: * On the lookup side: * 1. Load the folio from i_pages * 2. Increment the refcount if it's not zero * 3. If the folio is not found by xas_reload(), put the refcount and retry * * On the removal side: * A. Freeze the page (by zeroing the refcount if nobody else has a reference) * B. Remove the page from i_pages * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by * folio_put(). */ /* * filemap_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * * Looks up the page cache entry at @mapping & @index. If it is a folio, * it is returned with an increased refcount. If it is a shadow entry * of a previously evicted folio, or a swap entry from shmem/tmpfs, * it is returned without further action. * * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); folio = xas_load(&xas); if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ if (!folio || xa_is_value(folio)) goto out; if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); goto repeat; } out: rcu_read_unlock(); return folio; } /** * __filemap_get_folio - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * * Looks up the page cache entry at @mapping & @index. * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. 
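 *
 * An illustrative sketch (hypothetical caller): look up a folio, creating
 * and locking it if it is not already present:
 *
 *	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
 *				    mapping_gfp_mask(mapping));
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	...
 *	folio_unlock(folio);
 *	folio_put(folio);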
*/ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; repeat: folio = filemap_get_entry(mapping, index); if (xa_is_value(folio)) folio = NULL; if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); } /* Has the page been truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ if (folio_test_idle(folio)) folio_clear_idle(folio); } if (fgp_flags & FGP_STABLE) folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { unsigned int min_order = mapping_min_folio_order(mapping); unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; if (order > mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); do { gfp_t alloc_gfp = gfp; err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); if (fgp_flags & FGP_DONTCACHE) __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; } while (order-- > min_order); if (err == -EEXIST) goto repeat; if (err) { /* * When NOWAIT I/O fails to allocate folios this could * be due to a nonblocking memory allocation and not * because the system actually is out of memory. * Return -EAGAIN so that there caller retries in a * blocking fashion instead of propagating -ENOMEM * to the application. */ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) err = -EAGAIN; return ERR_PTR(err); } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. */ if (folio && (fgp_flags & FGP_FOR_MMAP)) folio_unlock(folio); } if (!folio) return ERR_PTR(-ENOENT); /* not an uncached lookup, clear uncached if set */ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { struct folio *folio; retry: if (mark == XA_PRESENT) folio = xas_find(xas, max); else folio = xas_find_marked(xas, max, mark); if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. 
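	 * Callers tell such value entries apart from real folios with
	 * xa_is_value(), so no reference needs to be taken on them.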
*/ if (!folio || xa_is_value(folio)) return folio; if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { folio_put(folio); goto reset; } return folio; reset: xas_reset(xas); goto retry; } /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in * the mapping. The entries are placed in @fbatch. find_get_entries() * takes a reference on any actual folios it returns. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries or large folios. * * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; } if (folio_batch_count(fbatch)) { unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); else nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); *start = round_down(indices[idx] + nr, nr); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * find_lock_entries - Find a batch of pagecache entries. * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. * Swap, shadow and DAX entries are included. Folios are returned * locked and with an incremented refcount. Folios which are locked * by somebody else or under writeback are skipped. Folios which are * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries, large folios, folios which could not be * locked or folios under writeback. * * Return: The number of entries which were found. 
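 *
 * A simplified, hypothetical consumer (value entries are ignored here; a
 * real caller must handle them and strip them from the batch before any
 * folio_batch_release()):
 *
 *	while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
 *		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 *			struct folio *folio = fbatch.folios[i];
 *
 *			... operate on the locked folio ...
 *			folio_unlock(folio);
 *			folio_put(folio);
 *		}
 *		folio_batch_init(&fbatch);
 *	}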
*/ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { unsigned long base; unsigned long nr; if (!xa_is_value(folio)) { nr = folio_nr_pages(folio); base = folio->index; /* Omit large folio which begins before the start */ if (base < *start) goto put; /* Omit large folio which extends beyond the end */ if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; if (folio->mapping != mapping || folio_test_writeback(folio)) goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); } else { nr = 1 << xas_get_order(&xas); base = xas.xa_index & ~(nr - 1); /* Omit order>0 value which begins before the start */ if (base < *start) continue; /* Omit order>0 value which extends beyond the end */ if (base + nr - 1 > end) break; } /* Update start now so that last update is correct on return */ *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; continue; unlock: folio_unlock(folio); put: folio_put(folio); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill. * * Search for and return a batch of folios in the mapping starting at * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_contig() works exactly like filemap_get_folios(), * except the returned folios are guaranteed to be contiguous. This may * not return all contiguous folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); unsigned long nr; struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio && xas.xa_index <= end; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) goto update_start; /* If we landed in the middle of a THP, continue at its end. 
*/ if (xa_is_sibling(folio)) goto update_start; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } update_start: nr = folio_batch_count(fbatch); if (nr) { folio = fbatch->folios[nr - 1]; *start = folio_next_index(folio); } out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_contig); /** * filemap_get_folios_tag - Get a batch of folios matching @tag * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @tag: The tag index * @fbatch: The batch to fill * * The first folio may start before @start; if it does, it will contain * @start. The final folio may extend beyond @end; if it does, it will * contain @end. The folios have ascending indices. There may be gaps * between the folios if there are indices which have no folio in the * page cache. If folios are added to or removed from the page cache * while this is running, they may or may not be found by this call. * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. */ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a page at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_tag); /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: * * ---R__________________________________________B__________ * ^ reading here ^ bad block(assume 4k) * * read(R) => miss => readahead(R...B) => media error => frustrating retries * => failing the whole request => read(R) => read(R+1) => * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... * * It is going insane. Fix it by quickly scaling down the readahead size. */ static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } /* * filemap_get_read_batch - Get a batch of folios for read * * Get a batch of folios which represent a contiguous range of bytes in * the file. No exceptional entries will be returned. If @index is in * the middle of a folio, the entire folio will be returned. The last * folio in the batch may have the readahead flag set or the uptodate flag * clear so that the caller can take the appropriate action. 
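 * (In practice the caller, filemap_get_pages(), starts asynchronous
 * readahead in the first case and reads or waits on the folio in the
 * second.)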
*/ static void filemap_get_read_batch(struct address_space *mapping, pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; if (xas.xa_index > max || xa_is_value(folio)) break; if (xa_is_sibling(folio)) break; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) break; if (!folio_test_uptodate(folio)) break; if (folio_test_readahead(folio)) break; xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); unsigned long pflags; int error; /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); error = filler(file, folio); if (unlikely(workingset)) psi_memstall_leave(&pflags); if (error) return error; error = folio_wait_locked_killable(folio); if (error) return error; if (folio_test_uptodate(folio)) return 0; if (file) shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, loff_t pos, size_t count, struct folio *folio, bool need_uptodate) { if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; } else { pos -= folio_pos(folio); } if (pos == 0 && count >= folio_size(folio)) return false; return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, size_t count, struct folio *folio, bool need_uptodate) { int error; if (iocb->ki_flags & IOCB_NOWAIT) { if (!filemap_invalidate_trylock_shared(mapping)) return -EAGAIN; } else { filemap_invalidate_lock_shared(mapping); } if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); /* * This is where we usually end up waiting for a * previously submitted readahead to finish. 
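			 * Our folio reference is dropped while sleeping, so
			 * the folio may be truncated or reclaimed meanwhile;
			 * returning AOP_TRUNCATED_PAGE tells the caller to
			 * redo the page cache lookup from scratch.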
*/ folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; if (!folio->mapping) goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, need_uptodate)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); goto unlock_mapping; unlock: folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) folio_put(folio); return error; } static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock * here assures we cannot instantiate and bring uptodate new * pagecache folios after evicting page cache during truncate * and before actually freeing blocks. Note that we could * release invalidate_lock after inserting the folio into * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. 
*/ filemap_invalidate_lock_shared(mapping); index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, struct address_space *mapping, struct folio *folio, pgoff_t last_index) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, size_t count, struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; unsigned int flags; int err = 0; /* "last_index" is the index of the folio beyond the end of the read */ last_index = round_up(iocb->ki_pos + count, mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } folio = fbatch->folios[folio_batch_count(fbatch) - 1]; if (folio_test_readahead(folio)) { err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } if (!folio_test_uptodate(folio)) { if (folio_batch_count(fbatch) > 1) { err = -EAGAIN; goto err; } err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) folio_put(folio); if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) { unsigned int shift = folio_shift(folio); return (pos1 >> shift == pos2 >> shift); } static void filemap_end_dropbehind_read(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. * @iter: Destination for the data. * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. 
If an error happens before any bytes are copied, returns * a negative error number. */ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos < 0)) return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { cond_resched(); /* * If we've already successfully copied some data, then we * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(mapping); /* * When a read accesses the same folio several times, only * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); size_t copied; if (end_offset < folio_pos(folio)) break; if (i > 0) folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; break; } } put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? 
already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, pos, end)) return -EAGAIN; return 0; } return filemap_write_and_wait_range(mapping, pos, end); } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait) { int ret; if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) return ret; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; return filemap_invalidate_pages(mapping, iocb->ki_pos, iocb->ki_pos + count - 1, iocb->ki_flags & IOCB_NOWAIT); } EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block * @iter: destination for the data read * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. * * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall * be returned when no data can be read without waiting for I/O requests * to complete; it doesn't prevent readahead. * * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O * requests shall be made for the read or for readahead. When no data * can be read, -EAGAIN shall be returned. When readahead would be * triggered, a partial, possibly empty read shall be returned. * * Return: * * number of bytes copied, even for partial reads * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; if (!count) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; retval = kiocb_write_and_wait(iocb, count); if (retval < 0) return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; count -= retval; } if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter * compressed extents, so if there was an error, or if * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ if (retval < 0 || !count || IS_DAX(inode)) return retval; if (iocb->ki_pos >= i_size_read(inode)) return retval; } return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); /* * Splice subpages from a folio into a pipe. 
*/ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size) { struct page *page; size_t spliced = 0, offset = offset_in_folio(folio, fpos); page = folio_page(folio, offset / PAGE_SIZE); size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); *buf = (struct pipe_buffer) { .ops = &page_cache_pipe_buf_ops, .page = page, .offset = offset, .len = part, }; folio_get(folio); pipe->head++; page++; spliced += part; offset = 0; } return spliced; } /** * filemap_splice_read - Splice data from a file's pagecache into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function gets folios from a file's pagecache and splices them into the * pipe. Readahead will be called as necessary to fill more folios. This may * be used for blockdevs also. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct folio_batch fbatch; struct kiocb iocb; size_t total_spliced = 0, used, npages; loff_t isize, end_offset; bool writably_mapped; int i, error = 0; if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) return 0; init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); folio_batch_init(&fbatch); do { cond_resched(); if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; error = filemap_get_pages(&iocb, len, &fbatch, true); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(in->f_mapping); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t n; if (folio_pos(folio) >= end_offset) goto out; folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. 
*/ if (writably_mapped) flush_dcache_folio(folio); n = min_t(loff_t, len, isize - *ppos); n = splice_folio_into_pipe(pipe, folio, *ppos, n); if (!n) goto out; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) goto out; } folio_batch_release(&fbatch); } while (len); out: folio_batch_release(&fbatch); file_accessed(in); return total_spliced ? total_spliced : error; } EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); folio_lock(folio); if (unlikely(folio->mapping != mapping)) goto unlock; offset = offset_in_folio(folio, start) & ~(bsz - 1); do { if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: folio_unlock(folio); rcu_read_lock(); return start; } static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } /** * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. * @mapping: Address space to search. * @start: First byte to consider. * @end: Limit of search (exclusive). * @whence: Either SEEK_HOLE or SEEK_DATA. * * If the page cache knows which blocks contain holes and which blocks * contain data, your filesystem can use this function to implement * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. */ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; size_t seek_size; if (start < pos) { if (!seek_data) goto unlock; start = pos; } seek_size = seek_folio_size(&xas, folio); pos = round_up((u64)pos + 1, seek_size); start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; if (start >= end) break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(folio)) folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); if (folio && !xa_is_value(folio)) folio_put(folio); if (start > end) return end; return start; } #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * * This works similar to lock_folio_or_retry in that it can drop the * mmap_lock. 
It differs in that it actually returns the folio locked * if it returns 1 and 0 if it couldn't lock the folio. If we did have * to drop the mmap_lock then fpin will point to the pinned file and * needs to be fput()'ed at a later point. */ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { if (folio_trylock(folio)) return 1; /* * NOTE! This will make us return with VM_FAULT_RETRY, but with * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the * fault lock, but all fault_handlers only check * for fatal signals if we return VM_FAULT_RETRY, * so we need to drop the fault lock here and * return 0 if we don't have a fpin. */ if (*fpin == NULL) release_fault_lock(vmf); return 0; } } else __folio_lock(folio); return 1; } /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have * to drop the mmap sem we return the file that was pinned in order for us to do * that. If we didn't pin a file then we return NULL. The file that is * returned needs to be fput()'ed when we're done with it. */ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; unsigned short mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; /* * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; ra->order = HPAGE_PMD_ORDER; page_cache_ra_order(&ractl, ra); return fpin; } #endif /* * If we don't want any read-ahead, don't bother. VM_EXEC case below is * already intended for random access. */ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ if (mmap_miss > MMAP_LOTSAMISS) return fpin; if (vm_flags & VM_EXEC) { /* * Allow arch to request a preferred minimum folio order for * executable memory. This can often be beneficial to * performance if (e.g.) arm64 can contpte-map the folio. * Executable memory rarely benefits from readahead, due to its * random access nature, so set async_size to 0. * * Limit to the boundaries of the VMA to avoid reading in any * pad that might exist between sections, which would be a waste * of memory. 
*/ struct vm_area_struct *vma = vmf->vma; unsigned long start = vma->vm_pgoff; unsigned long end = start + vma_pages(vma); unsigned long ra_end; ra->order = exec_folio_order(); ra->start = round_down(vmf->pgoff, 1UL << ra->order); ra->start = max(ra->start, start); ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); ra_end = min(ra_end, end); ra->size = ra_end - ra->start; ra->async_size = 0; } else { /* * mmap read-around */ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra->order = 0; } fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index = ra->start; page_cache_ra_order(&ractl, ra); return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned short mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; /* * If the folio is locked, we're likely racing against another fault. * Don't touch the mmap_miss counter to avoid decreasing it multiple * times for a single folio and break the balance with mmap_miss * increase in do_sync_mmap_readahead(). */ if (likely(!folio_test_locked(folio))) { mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; pte_t *ptep; /* * We might have COW'ed a pagecache folio and might now have an mlocked * anon folio mapped. The original pagecache folio is not mlocked and * might have been evicted. During a read+clear/modify/write update of * the PTE, such as done in do_numa_page()/change_pte_range(), we * temporarily clear the PTE under PT lock and might detect it here as * "none" when not holding the PT lock. * * Not rechecking the PTE under PT lock could result in an unexpected * major fault in an mlock'ed region. Recheck only for this special * scenario while holding the PT lock, to not degrade non-mlocked * scenarios. Recheck the PTE without PT lock firstly, thereby reducing * the number of times we hold PT lock. */ if (!(vma->vm_flags & VM_LOCKED)) return 0; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { ret = VM_FAULT_NOPAGE; } else { spin_lock(vmf->ptl); if (unlikely(!pte_none(ptep_get(ptep)))) ret = VM_FAULT_NOPAGE; spin_unlock(vmf->ptl); } pte_unmap(ptep); return ret; } /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. 
* * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. * * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t max_idx, index = vmf->pgoff; struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; trace_mm_filemap_fault(mapping, index); /* * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) fpin = do_async_mmap_readahead(vmf, folio); if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } } else { ret = filemap_fault_recheck_pte_none(vmf); if (unlikely(ret)) return ret; /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: /* * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto retry_find; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked folio in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error, * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } /* * OK, the folio is really not uptodate. This can be because the * VMA has the VM_RAND_READ flag set, or because an error * arose. Let's read it in directly. */ goto page_not_uptodate; } /* * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ if (fpin) { folio_unlock(folio); goto out_retry; } if (mapping_locked) filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. * We must recheck i_size under page lock. 
*/ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) { folio_unlock(folio); folio_put(folio); return VM_FAULT_SIGBUS; } vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; out_retry: /* * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ if (!IS_ERR(folio)) folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { folio_unlock(folio); folio_put(folio); return true; } if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); return true; } } if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; } static struct folio *next_uptodate_folio(struct xa_state *xas, struct address_space *mapping, pgoff_t end_pgoff) { struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { if (!folio) return NULL; if (xas_retry(xas, folio)) continue; if (xa_is_value(folio)) continue; if (!folio_try_get(folio)) continue; if (folio_test_locked(folio)) goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; if (!folio_trylock(folio)) goto skip; if (folio->mapping != mapping) goto unlock; if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; return folio; unlock: folio_unlock(folio); skip: folio_put(folio); } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } /* * Map page range [start_page, start_page + nr_pages) of folio. * start_page is gotten from start by folio_page(folio, start) */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; do { if (PageHWPoison(page + count)) goto skip; /* * If there are too many folios that are recently evicted * in a file, they will probably continue to be evicted. * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. 
*/ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; continue; skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } count++; page += count; vmf->pte += count; addr += count * PAGE_SIZE; count = 0; } while (--nr_pages > 0); if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) return ret; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) return ret; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; folio_ref_inc(folio); return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; unsigned short mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) { folio_unlock(folio); folio_put(folio); goto out; } file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; if (end_pgoff > file_end) end_pgoff = file_end; folio_type = mm_counter_file(folio); do { unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); if (mmap_miss >= mmap_miss_saved) WRITE_ONCE(file->f_ra.mmap_miss, 0); else WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct 
folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty folio and writeprotect it again. */ folio_mark_dirty(folio); folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; } const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; return 0; } int generic_file_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); desc->vm_ops = &generic_file_vm_ops; return 0; } /* * This is for filesystems which do not implement ->writepage. */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { if (is_shared_maywrite(desc->vm_flags)) return -EINVAL; return generic_file_mmap_prepare(desc); } #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_mmap_prepare); EXPORT_SYMBOL(generic_file_readonly_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap_prepare); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; if (!filler) filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping)); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } goto filler; } if (folio_test_uptodate(folio)) goto out; if (!folio_trylock(folio)) { folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); goto repeat; } /* Folio was truncated from mapping */ if (!folio->mapping) { folio_unlock(folio); folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ if (folio_test_uptodate(folio)) { folio_unlock(folio); goto out; } filler: err = filemap_read_folio(file, filler, folio); if (err) { folio_put(folio); if (err == AOP_TRUNCATED_PAGE) goto repeat; return ERR_PTR(err); } out: folio_mark_accessed(folio); return 
folio; } /** * read_cache_folio - Read into page cache, fill it if needed. * @mapping: The address_space to read from. * @index: The index to read. * @filler: Function to perform the read, or NULL to use aops->read_folio(). * @file: Passed to filler function, may be NULL if not required. * * Read one page into the page cache. If it succeeds, the folio returned * will contain @index, but it may not be the first page of the folio. * * If the filler function returns an error, it will be returned to the * caller. * * Context: May sleep. Expects mapping->invalidate_lock to be held. * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file) { return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); /** * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. * @mapping: The address_space for the folio. * @index: The index that the allocated folio will contain. * @gfp: The page allocator flags to use if allocating. * * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with * any new memory allocations done using the specified allocation flags. * * The most likely error from this function is EIO, but ENOMEM is * possible and so is EINTR. If ->read_folio returns another error, * that will be returned to the caller. * * The function expects mapping->invalidate_lock to be already held. * * Return: Uptodate folio on success, ERR_PTR() on failure. */ struct folio *mapping_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_folio(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(mapping_read_folio_gfp); static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file) { return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. * @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. * * The function expects mapping->invalidate_lock to be already held. * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; char *path; errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, current->comm); } } void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (mapping->nrpages && invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) dio_warn_stale_pagecache(iocb->ki_filp); } ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct address_space *mapping = iocb->ki_filp->f_mapping; size_t write_len = iov_iter_count(from); ssize_t written; /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; return written; } written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been * cached by non-direct readahead, or faulted in by get_user_pages() * if the source of the write was an mmap'ed region of the file * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... * * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * * Noticeable example is a blkdev_direct_IO(). * * Skip invalidation for async writes or if mapping has no pages. */ if (written > 0) { struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); return written; } EXPORT_SYMBOL(generic_file_direct_write); ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; bytes = iov_iter_count(i); retry: offset = pos & (chunk - 1); bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); /* * Faults here on mmap()s can recurse into arbitrary * filesystem code. Lots of locks are held that can * deadlock. Use an atomic copy to avoid deadlocking * in page fault handling. 
*/ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) break; } cond_resched(); if (unlikely(status == 0)) { /* * A short copy made ->write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { bytes = copied; goto retry; } /* * 'folio' is now unlocked and faults on it can be * handled. Ensure forward progress by trying to * fault it in now. */ if (fault_in_iov_iter_readable(i, bytes) == bytes) { status = -EFAULT; break; } } else { pos += status; written += status; } } while (iov_iter_count(i)); if (!written) return status; iocb->ki_pos += written; return written; } EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; ret = file_remove_privs(file); if (ret) return ret; ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to * holes, for example. For DAX files, a buffered write will * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) return ret; return direct_write_fallback(iocb, from, ret, generic_perform_write(iocb, from)); } return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); /** * generic_file_write_iter - write data to a file * @iocb: IO state structure * @from: iov_iter with data to write * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_rwsem as needed. 
* Return: * * negative error code if no data has been written at all or * vfs_fsync_range() failed for a synchronous write * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. * @folio: The folio which the kernel is trying to free. * @gfp: Memory allocation flags (and I/O mode). * * The address_space is trying to release any data attached to a folio * (presumably at folio->private). * * This will also be called if the private_2 flag is set on a page, * indicating that the folio has other metadata associated with it. * * The @gfp argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block * (__GFP_RECLAIM & __GFP_FS). * * Return: %true if the release was successful, otherwise %false. */ bool filemap_release_folio(struct folio *folio, gfp_t gfp) { struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); if (!folio_needs_release(folio)) return true; if (folio_test_writeback(folio)) return false; if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); /** * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache * @inode: The inode to flush * @flush: Set to write back rather than simply invalidate. * @start: First byte in range. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start * onwards. * * Invalidate all the folios on an inode that contribute to the specified * range, possibly writing them back first. Whilst the operation is * undertaken, the invalidate lock is held to prevent new folios from being * installed. */ int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; pgoff_t first = start >> PAGE_SHIFT; pgoff_t last = end >> PAGE_SHIFT; pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; if (!mapping || !mapping->nrpages || end < start) goto out; /* Prevent new folios from being added to the inode. */ filemap_invalidate_lock(mapping); if (!mapping->nrpages) goto unlock; unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ if (flush) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; filemap_fdatawrite_wbc(mapping, &wbc); } /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); out: return filemap_check_errors(mapping); } EXPORT_SYMBOL_GPL(filemap_invalidate_inode); #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping * @mapping: The mapping to compute the statistics for. * @first_index: The starting page cache index. * @last_index: The final page index (inclusive). * @cs: the cachestat struct to write the result to.
* * This will query the page cache statistics of a mapping in the * page range of [first_index, last_index] (inclusive). The statistics * queried include: number of dirty pages, number of pages marked for * writeback, and the number of (recently) evicted pages. */ static void filemap_cachestat(struct address_space *mapping, pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) { XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; /* Flush stats (and potentially sleep) outside the RCU read section. */ mem_cgroup_flush_stats_ratelimited(NULL); rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; /* * Don't deref the folio. It is not pinned, and might * get freed (and reused) underneath us. * * We *could* pin it, but that would be expensive for * what should be a fast and lightweight syscall. * * Instead, derive all information of interest from * the rcu-protected xarray. */ if (xas_retry(&xas, folio)) continue; order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; /* Folios might straddle the range boundaries, only count covered pages */ if (folio_first_index < first_index) nr_pages -= first_index - folio_first_index; if (folio_last_index > last_index) nr_pages -= folio_last_index - last_index; if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ cs->nr_evicted += nr_pages; #ifdef CONFIG_SWAP /* implies CONFIG_MMU */ if (shmem_mapping(mapping)) { /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ if (non_swap_entry(swp)) goto resched; /* * Getting a swap entry from the shmem * inode means we beat * shmem_unuse(). rcu_read_lock() * ensures swapoff waits for us before * freeing the swapper space. However, * we can race with swapping and * invalidation, so there might not be * a shadow in the swapcache (yet). */ shadow = get_shadow_from_swap_cache(swp); if (!shadow) goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; } /* page is in cache */ cs->nr_cache += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); } /* * See mincore: reveal pagecache information only for files * that the calling process has write access to, or could (if * tried) open for writing. */ static inline bool can_do_cachestat(struct file *f) { if (f->f_mode & FMODE_WRITE) return true; if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) return true; return file_permission(f, MAY_WRITE) == 0; } /* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the * bytes range specified by `off` and `len`: number of cached pages, * number of dirty pages, number of pages marked for writeback, * number of evicted pages, and number of recently evicted pages. * * An evicted page is a page that is previously in the page cache * but has been evicted since. A page is recently evicted if its last * eviction was recent enough that its reentry to the cache would * indicate that it is actively being used by the system, and that * there is memory pressure on the system. 
* * `off` and `len` must be non-negative integers. If `len` > 0, * the queried range is [`off`, `off` + `len`]. If `len` == 0, * we will query in the range from `off` to the end of the file. * * The `flags` argument is unused for now, but is included for future * extensibility. User should pass 0 (i.e. no flag specified). * * Currently, hugetlbfs is not supported. * * Because the status of a page can change after cachestat() checks it * but before it returns to the application, the returned values may * contain stale information. * * return values: * zero - success * -EFAULT - cstat or cstat_range points to an illegal address * -EINVAL - invalid flags * -EBADF - invalid file descriptor * -EOPNOTSUPP - file descriptor is of a hugetlbfs file */ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, sizeof(struct cachestat_range))) return -EFAULT; /* hugetlbfs is not supported */ if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; if (!can_do_cachestat(fd_file(f))) return -EPERM; if (flags != 0) return -EINVAL; first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; return 0; } #endif /* CONFIG_CACHESTAT_SYSCALL */
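/*
 * Illustrative userspace sketch (an editorial addition, not part of the
 * original file): calling the cachestat(2) syscall defined above.  The
 * structures come from the kernel UAPI (<linux/mman.h> in recent kernels);
 * a raw syscall(2) is used in case the libc has no wrapper.  The file path
 * is arbitrary and only serves as an example.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/mman.h>
 *
 *	int main(void)
 *	{
 *		struct cachestat_range cr = { .off = 0, .len = 0 };
 *		struct cachestat cs;
 *		int fd = open("/etc/hostname", O_RDONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		// len == 0 queries from off to the end of the file
 *		if (syscall(__NR_cachestat, fd, &cr, &cs, 0))
 *			return 1;
 *		printf("cached %llu dirty %llu writeback %llu\n",
 *		       (unsigned long long)cs.nr_cache,
 *		       (unsigned long long)cs.nr_dirty,
 *		       (unsigned long long)cs.nr_writeback);
 *		close(fd);
 *		return 0;
 *	}
 */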
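/*
 * Illustrative sketch (an editorial addition, not part of the original
 * file): how a simple filesystem might wire the generic helpers above into
 * its file_operations.  "examplefs" is a hypothetical name; every helper
 * referenced below is exported by this file or elsewhere in the kernel.
 *
 *	static const struct file_operations examplefs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *		.splice_read	= filemap_splice_read,
 *		.splice_write	= iter_file_splice_write,
 *		.fsync		= generic_file_fsync,
 *	};
 *
 * generic_file_mmap() installs generic_file_vm_ops, so filemap_fault(),
 * filemap_map_pages() and filemap_page_mkwrite() then handle faults on such
 * a mapping without any per-filesystem code.
 */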
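/*
 * Illustrative sketch (an editorial addition, not part of the original
 * file): a filesystem whose data/hole state lives entirely in the page
 * cache could build SEEK_HOLE/SEEK_DATA on mapping_seek_hole_data() along
 * these lines.  "examplefs_llseek" is a hypothetical helper and the locking
 * shown is only one plausible choice.
 *
 *	static loff_t examplefs_llseek(struct file *file, loff_t offset,
 *				       int whence)
 *	{
 *		struct inode *inode = file_inode(file);
 *		loff_t ret;
 *
 *		if (whence != SEEK_HOLE && whence != SEEK_DATA)
 *			return generic_file_llseek(file, offset, whence);
 *
 *		inode_lock_shared(inode);
 *		ret = mapping_seek_hole_data(file->f_mapping, offset,
 *					     i_size_read(inode), whence);
 *		inode_unlock_shared(inode);
 *		if (ret >= 0)
 *			ret = vfs_setpos(file, ret, inode->i_sb->s_maxbytes);
 *		return ret;
 *	}
 */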
// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
#include "sysfs.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 /* Invalid allocation pointer value for missing devices */ #define WP_MISSING_DEV ((u64)-1) /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) /* * Location of the first zone of superblock logging zone pairs. * * - primary superblock: 0B (zone 0) * - first copy: 512G (zone starting at that offset) * - second copy: 4T (zone starting at that offset) */ #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 /* Default number of max active zones when the device has no limits. */ #define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128 /* * Minimum of active zones we need: * * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group * - 1 zone for tree-log dedicated block group * - 1 zone for relocation */ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* * Minimum / maximum supported zone size. Currently, SMR disks have a zone * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. * We do not expect the zone size to become larger than 8GiB or smaller than * 4MiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G #define BTRFS_MIN_ZONE_SIZE SZ_4M #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) static void wait_eb_writebacks(struct btrfs_block_group *block_group); static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written); static inline bool sb_zone_is_full(const struct blk_zone *zone) { return (zone->cond == BLK_ZONE_COND_FULL) || (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); } static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; memcpy(&zones[idx], zone, sizeof(*zone)); return 0; } static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, u64 *wp_ret) { bool empty[BTRFS_NR_SB_LOG_ZONES]; bool full[BTRFS_NR_SB_LOG_ZONES]; sector_t sector; for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); full[i] = sb_zone_is_full(&zones[i]); } /* * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] * Empty[1] * 0 1 * In use[1] x x 1 * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written * 0: Use write pointer of zones[0] * 1: Use write pointer of zones[1] * C: Compare super blocks from zones[0] and zones[1], use the latest * one determined by generation * x: Invalid state */ if (empty[0] && empty[1]) { /* Special case to distinguish no superblock to read */ *wp_ret = zones[0].start << SECTOR_SHIFT; return -ENOENT; } else if (full[0] && full[1]) { /* Compare two super blocks */ struct address_space *mapping = bdev->bd_mapping; struct page *page[BTRFS_NR_SB_LOG_ZONES]; struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT; u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - BTRFS_SUPER_INFO_SIZE; page[i] = read_cache_page_gfp(mapping, 
bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page[i])) { if (i == 1) btrfs_release_disk_super(super[0]); return PTR_ERR(page[i]); } super[i] = page_address(page[i]); } if (btrfs_super_generation(super[0]) > btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) btrfs_release_disk_super(super[i]); } else if (!full[0] && (empty[1] || full[1])) { sector = zones[0].wp; } else if (full[0]) { sector = zones[1].wp; } else { return -EUCLEAN; } *wp_ret = sector << SECTOR_SHIFT; return 0; } /* * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { case 0: zone = 0; break; case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } ASSERT(zone <= U32_MAX); return (u32)zone; } static inline sector_t zone_start_sector(u32 zone_number, struct block_device *bdev) { return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); } static inline u64 zone_start_physical(u32 zone_number, struct btrfs_zoned_device_info *zone_info) { return (u64)zone_number << zone_info->zone_size_shift; } /* * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block * device into static sized chunks and fake a conventional zone on each of * them. */ static int emulate_report_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int nr_zones) { const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; sector_t bdev_size = bdev_nr_sectors(device->bdev); unsigned int i; pos >>= SECTOR_SHIFT; for (i = 0; i < nr_zones; i++) { zones[i].start = i * zone_sectors + pos; zones[i].len = zone_sectors; zones[i].capacity = zone_sectors; zones[i].wp = zones[i].start + zone_sectors; zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; zones[i].cond = BLK_ZONE_COND_NOT_WP; if (zones[i].wp >= bdev_size) { i++; break; } } return i; } static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; int ret; if (!*nr_zones) return 0; if (!bdev_is_zoned(device->bdev)) { ret = emulate_report_zones(device, pos, zones, *nr_zones); *nr_zones = ret; return 0; } /* Check cache */ if (zinfo->zone_cache) { unsigned int i; u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; /* * We cannot report zones beyond the zone end. So, it is OK to * cap *nr_zones to at the end. 
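	 * I.e. clamp the request to the zones remaining between @pos and the
	 * device end before consulting the cache.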
*/ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); for (i = 0; i < *nr_zones; i++) { struct blk_zone *zone_info; zone_info = &zinfo->zone_cache[zno + i]; if (!zone_info->len) break; } if (i == *nr_zones) { /* Cache hit on all the zones */ memcpy(zones, zinfo->zone_cache + zno, sizeof(*zinfo->zone_cache) * *nr_zones); return 0; } } ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { btrfs_err(device->fs_info, "zoned: failed to read zone %llu on %s (devid %llu)", pos, rcu_dereference(device->name), device->devid); return ret; } *nr_zones = ret; if (!ret) return -EIO; /* Populate cache */ if (zinfo->zone_cache) { u32 zno = pos >> zinfo->zone_size_shift; memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); } return 0; } /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_dev_extent *dext; int ret = 0; key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; /* No dev extents at all? Not good */ if (ret > 0) return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int ret = 0; /* fs_info->zone_size might not set yet. Use the incomapt flag here. */ if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { /* We can skip reading of zone info for missing devices */ if (!device->bdev) continue; ret = btrfs_get_dev_zone_info(device, true); if (ret) break; } mutex_unlock(&fs_devices->device_list_mutex); return ret; } int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; sector_t zone_sectors; char *model, *emulated; int ret; /* * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not * yet be set. 
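	 * The ZONED incompat flag is already valid here, so test that instead.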
*/ if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; if (device->zone_info) return 0; zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); if (!zone_info) return -ENOMEM; device->zone_info = zone_info; if (!bdev_is_zoned(bdev)) { if (!fs_info->zone_size) { ret = calculate_emulated_zone_size(fs_info); if (ret) goto out; } ASSERT(fs_info->zone_size); zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; } else { zone_sectors = bdev_zone_sectors(bdev); } ASSERT(is_power_of_two_u64(zone_sectors)); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; /* We reject devices with a zone size larger than 8GB */ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { btrfs_err(fs_info, "zoned: %s: zone size %llu larger than supported maximum %llu", rcu_dereference(device->name), zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { btrfs_err(fs_info, "zoned: %s: zone size %llu smaller than supported minimum %u", rcu_dereference(device->name), zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); ret = -EINVAL; goto out; } nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; max_active_zones = min_not_zero(bdev_max_active_zones(bdev), bdev_max_open_zones(bdev)); if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES) max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES; if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", rcu_dereference(device->name), max_active_zones, BTRFS_MIN_ACTIVE_ZONES); ret = -EINVAL; goto out; } zone_info->max_active_zones = max_active_zones; zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; goto out; } zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->empty_zones) { ret = -ENOMEM; goto out; } zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->active_zones) { ret = -ENOMEM; goto out; } zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; } /* * Enable zone cache only for a zoned device. On a non-zoned device, we * fill the zone info with emulated CONVENTIONAL zones, so no need to * use the cache. 
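	 * Emulated zones are generated on the fly by emulate_report_zones(),
	 * so caching them would only cost memory.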
*/ if (populate_cache && bdev_is_zoned(device->bdev)) { zone_info->zone_cache = vcalloc(zone_info->nr_zones, sizeof(struct blk_zone)); if (!zone_info->zone_cache) { btrfs_err(device->fs_info, "zoned: failed to allocate zone cache for %s", rcu_dereference(device->name)); ret = -ENOMEM; goto out; } } /* Get zones type */ nactive = 0; while (sector < nr_sectors) { nr_zones = BTRFS_REPORT_NR_ZONES; ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, &nr_zones); if (ret) goto out; for (i = 0; i < nr_zones; i++) { if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) __set_bit(nreported, zone_info->seq_zones); switch (zones[i].cond) { case BLK_ZONE_COND_EMPTY: __set_bit(nreported, zone_info->empty_zones); break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: __set_bit(nreported, zone_info->active_zones); nactive++; break; } nreported++; } sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; } if (nreported != zone_info->nr_zones) { btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", rcu_dereference(device->name), nreported, zone_info->nr_zones); ret = -EIO; goto out; } if (max_active_zones) { if (nactive > max_active_zones) { btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_dereference(device->name), max_active_zones); ret = -EIO; goto out; } atomic_set(&zone_info->active_zones_left, max_active_zones - nactive); set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { u32 sb_zone; u64 sb_wp; int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; sb_zone = sb_zone_number(zone_info->zone_size_shift, i); if (sb_zone + 1 >= zone_info->nr_zones) continue; ret = btrfs_get_dev_zones(device, zone_start_physical(sb_zone, zone_info), &zone_info->sb_zones[sb_pos], &nr_zones); if (ret) goto out; if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; goto out; } /* * If zones[0] is conventional, always use the beginning of the * zone to record superblock. No need to validate in that case. 
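		 * Conventional zones have no write pointer, so sb_write_pointer()
		 * could not be used on them anyway.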
*/ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == BLK_ZONE_TYPE_CONVENTIONAL) continue; ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); if (ret != -ENOENT && ret) { btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; goto out; } } kvfree(zones); if (bdev_is_zoned(bdev)) { model = "host-managed zoned"; emulated = ""; } else { model = "regular"; emulated = "emulated "; } btrfs_info(fs_info, "%s block device %s, %u %szones of %llu bytes", model, rcu_dereference(device->name), zone_info->nr_zones, emulated, zone_info->zone_size); return 0; out: kvfree(zones); btrfs_destroy_dev_zone_info(device); return ret; } void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!zone_info) return; bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); vfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) { struct btrfs_zoned_device_info *zone_info; zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); if (!zone_info) return NULL; zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) goto out; bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, zone_info->nr_zones); zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->empty_zones) goto out; bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, zone_info->nr_zones); zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->active_zones) goto out; bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, zone_info->nr_zones); zone_info->zone_cache = NULL; return zone_info; out: bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->active_zones); kfree(zone_info); return NULL; } static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { unsigned int nr_zones = 1; int ret; ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); if (ret != 0 || !nr_zones) return ret ? ret : -EIO; return 0; } static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *device; list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev && bdev_is_zoned(device->bdev)) { btrfs_err(fs_info, "zoned: mode not enabled but zoned device found: %pg", device->bdev); return -EINVAL; } } return 0; } int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; int ret; /* * Host-Managed devices can't be used without the ZONED flag. With the * ZONED all devices can be used, using zone emulation if required. 
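	 * Without the flag, only make sure that no zoned device is part of
	 * the filesystem.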
*/ if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); blk_set_stacking_limits(lim); list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!device->bdev) continue; if (!zone_size) { zone_size = zone_info->zone_size; } else if (zone_info->zone_size != zone_size) { btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", zone_info->zone_size, zone_size); return -EINVAL; } /* * With the zoned emulation, we can have non-zoned device on the * zoned mode. In this case, we don't have a valid max zone * append size. */ if (bdev_is_zoned(device->bdev)) blk_stack_limits(lim, bdev_limits(device->bdev), 0); } ret = blk_validate_limits(lim); if (ret) { btrfs_err(fs_info, "zoned: failed to validate queue limits"); return ret; } /* * stripe_size is always aligned to BTRFS_STRIPE_LEN in * btrfs_create_chunk(). Since we want stripe_len == zone_size, * check the alignment here. */ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { btrfs_err(fs_info, "zoned: zone size %llu not aligned to stripe %u", zone_size, BTRFS_STRIPE_LEN); return -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "zoned: mixed block groups not supported"); return -EINVAL; } fs_info->zone_size = zone_size; /* * Also limit max_zone_append_size by max_segments * PAGE_SIZE. * Technically, we can have multiple pages per segment. But, since * we add the pages one by one to a bio, and cannot increase the * metadata reservation even if it increases the number of extents, it * is safe to stick with the limit. */ fs_info->max_zone_append_size = ALIGN_DOWN( min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, (u64)lim->max_sectors << SECTOR_SHIFT, (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned * from fs_info->zone_size. */ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt); if (ret) return ret; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); return 0; } int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, unsigned long long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; /* * Space cache writing is not COWed. Disable that to avoid write errors * in sequential zones. 
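	 * The free space tree (space cache v2) is still fine as it is updated
	 * through the regular COW path.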
*/ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_err(info, "zoned: space cache v1 is not supported"); return -EINVAL; } if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) { btrfs_err(info, "zoned: NODATACOW not supported"); return -EINVAL; } if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) { btrfs_info(info, "zoned: async discard ignored and disabled for zoned mode"); btrfs_clear_opt(*mount_opt, DISCARD_ASYNC); } return 0; } static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, int rw, u64 *bytenr_ret) { u64 wp; int ret; if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { *bytenr_ret = zones[0].start << SECTOR_SHIFT; return 0; } ret = sb_write_pointer(bdev, zones, &wp); if (ret != -ENOENT && ret < 0) return ret; if (rw == WRITE) { struct blk_zone *reset = NULL; if (wp == zones[0].start << SECTOR_SHIFT) reset = &zones[0]; else if (wp == zones[1].start << SECTOR_SHIFT) reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { unsigned int nofs_flags; ASSERT(sb_zone_is_full(reset)); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, reset->start, reset->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; reset->cond = BLK_ZONE_COND_EMPTY; reset->wp = reset->start; } } else if (ret != -ENOENT) { /* * For READ, we want the previous one. Move write pointer to * the end of a zone, if it is at the head of a zone. */ u64 zone_end = 0; if (wp == zones[0].start << SECTOR_SHIFT) zone_end = zones[1].start + zones[1].capacity; else if (wp == zones[1].start << SECTOR_SHIFT) zone_end = zones[0].start + zones[0].capacity; if (zone_end) wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, BTRFS_SUPER_INFO_SIZE); wp -= BTRFS_SUPER_INFO_SIZE; } *bytenr_ret = wp; return 0; } int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret) { struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; sector_t zone_sectors; u32 sb_zone; int ret; u8 zone_sectors_shift; sector_t nr_sectors; u32 nr_zones; if (!bdev_is_zoned(bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } ASSERT(rw == READ || rw == WRITE); zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) return -EINVAL; zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); if (sb_zone + 1 >= nr_zones) return -ENOENT; ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, zones); if (ret < 0) return ret; if (ret != BTRFS_NR_SB_LOG_ZONES) return -EIO; return sb_log_location(bdev, zones, rw, bytenr_ret); } int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret) { struct btrfs_zoned_device_info *zinfo = device->zone_info; u32 zone_num; /* * For a zoned filesystem on a non-zoned block device, use the same * super block locations as regular filesystem. Doing so, the super * block can always be retrieved and the zoned flag of the volume * detected from the super block information. 
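	 * There is no write pointer to follow on such a device, so the fixed
	 * btrfs_sb_offset() locations are used as-is.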
*/ if (!bdev_is_zoned(device->bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); if (zone_num + 1 >= zinfo->nr_zones) return -ENOENT; return sb_log_location(device->bdev, &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], rw, bytenr_ret); } static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, int mirror) { u32 zone_num; if (!zinfo) return false; zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); if (zone_num + 1 >= zinfo->nr_zones) return false; if (!test_bit(zone_num, zinfo->seq_zones)) return false; return true; } int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) { struct btrfs_zoned_device_info *zinfo = device->zone_info; struct blk_zone *zone; int i; if (!is_sb_log_zone(zinfo, mirror)) return 0; zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { /* Advance the next zone */ if (zone->cond == BLK_ZONE_COND_FULL) { zone++; continue; } if (zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; zone->wp += SUPER_INFO_SECTORS; if (sb_zone_is_full(zone)) { /* * No room left to write new superblock. Since * superblock is written with REQ_SYNC, it is safe to * finish the zone now. * * If the write pointer is exactly at the capacity, * explicit ZONE_FINISH is not necessary. */ if (zone->wp != zone->start + zone->capacity) { unsigned int nofs_flags; int ret; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; } zone->wp = zone->start + zone->len; zone->cond = BLK_ZONE_COND_FULL; } return 0; } /* All the zones are FULL. Should not reach here. */ DEBUG_WARN("unexpected state, all zones full"); return -EIO; } int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); if (sb_zone + 1 >= nr_zones) return -ENOENT; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start_sector(sb_zone, bdev), zone_sectors * BTRFS_NR_SB_LOG_ZONES); memalloc_nofs_restore(nofs_flags); return ret; } /* * Find allocatable zones within a given region. * * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region * @num_bytes: size of wanted region * @hole_end: the end of the hole * @return: position of allocatable zones * * Allocatable region should not contain any superblock locations. 
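 * If a superblock log zone pair or a regular superblock copy intersects the
 * candidate region, the search skips past it and tries again.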
*/ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, u64 hole_end, u64 num_bytes) { struct btrfs_zoned_device_info *zinfo = device->zone_info; const u8 shift = zinfo->zone_size_shift; u64 nzones = num_bytes >> shift; u64 pos = hole_start; u64 begin, end; bool have_sb; int i; ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); while (pos < hole_end) { begin = pos >> shift; end = begin + nzones; if (end > zinfo->nr_zones) return hole_end; /* Check if zones in the region are all empty */ if (btrfs_dev_is_sequential(device, pos) && !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) { pos += zinfo->zone_size; continue; } have_sb = false; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { u32 sb_zone; u64 sb_pos; sb_zone = sb_zone_number(shift, i); if (!(end <= sb_zone || sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { have_sb = true; pos = zone_start_physical( sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); break; } /* We also need to exclude regular superblock positions */ sb_pos = btrfs_sb_offset(i); if (!(pos + num_bytes <= sb_pos || sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { have_sb = true; pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, zinfo->zone_size); break; } } if (!have_sb) break; } return pos; } static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) { struct btrfs_zoned_device_info *zone_info = device->zone_info; unsigned int zno = (pos >> zone_info->zone_size_shift); /* We can use any number of zones */ if (zone_info->max_active_zones == 0) return true; if (!test_bit(zno, zone_info->active_zones)) { /* Active zone left? */ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) return false; if (test_and_set_bit(zno, zone_info->active_zones)) { /* Someone already set the bit */ atomic_inc(&zone_info->active_zones_left); } } return true; } static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) { struct btrfs_zoned_device_info *zone_info = device->zone_info; unsigned int zno = (pos >> zone_info->zone_size_shift); /* We can use any number of zones */ if (zone_info->max_active_zones == 0) return; if (test_and_clear_bit(zno, zone_info->active_zones)) atomic_inc(&zone_info->active_zones_left); } int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { unsigned int nofs_flags; int ret; *bytes = 0; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); if (ret) return ret; *bytes = length; while (length) { btrfs_dev_set_zone_empty(device, physical); btrfs_dev_clear_active_zone(device, physical); physical += device->zone_info->zone_size; length -= device->zone_info->zone_size; } return 0; } int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) { struct btrfs_zoned_device_info *zinfo = device->zone_info; const u8 shift = zinfo->zone_size_shift; unsigned long begin = start >> shift; unsigned long nbits = size >> shift; u64 pos; int ret; ASSERT(IS_ALIGNED(start, zinfo->zone_size)); ASSERT(IS_ALIGNED(size, zinfo->zone_size)); if (begin + nbits > zinfo->nr_zones) return -ERANGE; /* All the zones are conventional */ if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits)) return 0; /* All the zones are sequential and empty */ if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) && bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits)) return 0; for (pos = start; pos < start 
+ size; pos += zinfo->zone_size) { u64 reset_bytes; if (!btrfs_dev_is_sequential(device, pos) || btrfs_dev_is_empty_zone(device, pos)) continue; /* Free regions should be empty */ btrfs_warn( device->fs_info, "zoned: resetting device %s (devid %llu) zone %llu for allocation", rcu_dereference(device->name), device->devid, pos >> shift); WARN_ON_ONCE(1); ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, &reset_bytes); if (ret) return ret; } return 0; } /* * Calculate an allocation pointer from the extent allocation information * for a block group consist of conventional zones. It is pointed to the * end of the highest addressed extent in the block group as an allocation * offset. */ static int calculate_alloc_pointer(struct btrfs_block_group *cache, u64 *offset_ret, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; u64 length; /* * Avoid tree lookups for a new block group, there's no use for it. * It must always be 0. * * Also, we have a lock chain of extent buffer lock -> chunk mutex. * For new a block group, this function is called from * btrfs_make_block_group() which is already taking the chunk mutex. * Thus, we cannot call calculate_alloc_pointer() which takes extent * buffer locks to avoid deadlock. */ if (new) { *offset_ret = 0; return 0; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = cache->start + cache->length; key.type = 0; key.offset = 0; root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ if (!ret) ret = -EUCLEAN; if (ret < 0) return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { if (ret == 1) { ret = 0; *offset_ret = 0; } return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (found_key.type == BTRFS_EXTENT_ITEM_KEY) length = found_key.offset; else length = fs_info->nodesize; if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; return 0; } struct zone_info { u64 physical; u64 capacity; u64 alloc_offset; }; static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; int dev_replace_is_ongoing = 0; unsigned int nofs_flag; struct blk_zone zone; int ret; info->physical = map->stripes[zone_idx].physical; down_read(&dev_replace->rwsem); device = map->stripes[zone_idx].dev; if (!device->bdev) { up_read(&dev_replace->rwsem); info->alloc_offset = WP_MISSING_DEV; return 0; } /* Consider a zone as active if we can allow any number of active zones. */ if (!device->zone_info->max_active_zones) __set_bit(zone_idx, active); if (!btrfs_dev_is_sequential(device, info->physical)) { up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; return 0; } ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); /* This zone will be used for allocation, so mark this zone non-empty. 
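	 * This keeps the per-device empty-zone bitmap consistent with the
	 * upcoming allocations from this zone.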
*/ btrfs_dev_clear_zone_empty(device, info->physical); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical); /* * The group is mapped to a sequential zone. Get the zone write pointer * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); if (new) { sector_t capacity; capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); up_read(&dev_replace->rwsem); info->alloc_offset = 0; info->capacity = capacity << SECTOR_SHIFT; return 0; } nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); if (ret) { up_read(&dev_replace->rwsem); if (ret != -EIO && ret != -EOPNOTSUPP) return ret; info->alloc_offset = WP_MISSING_DEV; return 0; } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, rcu_dereference(device->name), device->devid); up_read(&dev_replace->rwsem); return -EIO; } info->capacity = (zone.capacity << SECTOR_SHIFT); switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), rcu_dereference(device->name), device->devid); info->alloc_offset = WP_MISSING_DEV; break; case BLK_ZONE_COND_EMPTY: info->alloc_offset = 0; break; case BLK_ZONE_COND_FULL: info->alloc_offset = info->capacity; break; default: /* Partially used zone. */ info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT); __set_bit(zone_idx, active); break; } up_read(&dev_replace->rwsem); return 0; } static int btrfs_load_block_group_single(struct btrfs_block_group *bg, struct zone_info *info, unsigned long *active) { if (info->alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", info->physical); return -EIO; } bg->alloc_offset = info->alloc_offset; bg->zone_capacity = info->capacity; if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); return 0; } static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree"); return -EINVAL; } bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); if (zone_info[0].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } if (zone_info[1].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); return -EIO; } if (zone_info[0].alloc_offset == WP_CONVENTIONAL) zone_info[0].alloc_offset = last_alloc; if (zone_info[1].alloc_offset == WP_CONVENTIONAL) zone_info[1].alloc_offset = last_alloc; if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } if (test_bit(0, active) != test_bit(1, active)) { if (!btrfs_zone_activate(bg)) return -EIO; } else if (test_bit(0, active)) { 
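		/* Both zones are active on disk: mark the block group active too. */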
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } bg->alloc_offset = zone_info[0].alloc_offset; return 0; } static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); return -EINVAL; } /* In case a device is missing we have a cap of 0, so don't use it. */ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) zone_info[i].alloc_offset = last_alloc; if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in %s profile", btrfs_bg_type_to_raid_name(map->type)); return -EIO; } if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_test_opt(fs_info, DEGRADED) && !btrfs_zone_activate(bg)) { return -EIO; } } else { if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } } if (zone_info[0].alloc_offset != WP_MISSING_DEV) bg->alloc_offset = zone_info[0].alloc_offset; else bg->alloc_offset = zone_info[i - 1].alloc_offset; return 0; } static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); return -EINVAL; } for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { u64 stripe_nr, full_stripe_nr; u64 stripe_offset; int stripe_index; stripe_nr = div64_u64(last_alloc, map->stripe_size); stripe_offset = stripe_nr * map->stripe_size; full_stripe_nr = div_u64(stripe_nr, map->num_stripes); div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); zone_info[i].alloc_offset = full_stripe_nr * map->stripe_size; if (stripe_index > i) zone_info[i].alloc_offset += map->stripe_size; else if (stripe_index == i) zone_info[i].alloc_offset += (last_alloc - stripe_offset); } if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; } else { if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; } return 0; } static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); return -EINVAL; } for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; } else { if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); 
} if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { u64 stripe_nr, full_stripe_nr; u64 stripe_offset; int stripe_index; stripe_nr = div64_u64(last_alloc, map->stripe_size); stripe_offset = stripe_nr * map->stripe_size; full_stripe_nr = div_u64(stripe_nr, map->num_stripes / map->sub_stripes); div_u64_rem(stripe_nr, (map->num_stripes / map->sub_stripes), &stripe_index); zone_info[i].alloc_offset = full_stripe_nr * map->stripe_size; if (stripe_index > (i / map->sub_stripes)) zone_info[i].alloc_offset += map->stripe_size; else if (stripe_index == (i / map->sub_stripes)) zone_info[i].alloc_offset += (last_alloc - stripe_offset); } if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; } } return 0; } int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; struct zone_info *zone_info = NULL; int ret; int i; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; /* Sanity check */ if (!IS_ALIGNED(length, fs_info->zone_size)) { btrfs_err(fs_info, "zoned: block group %llu len %llu unaligned to zone size %llu", logical, length, fs_info->zone_size); return -EIO; } map = btrfs_find_chunk_map(fs_info, logical, length); if (!map) return -EINVAL; cache->physical_map = map; zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { ret = -ENOMEM; goto out; } active = bitmap_zalloc(map->num_stripes, GFP_NOFS); if (!active) { ret = -ENOMEM; goto out; } for (i = 0; i < map->num_stripes; i++) { ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) num_conventional++; else num_sequential++; } if (num_sequential > 0) set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; ret = calculate_alloc_pointer(cache, &last_alloc, new); if (ret) { btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", cache->start); goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } } profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: ret = btrfs_load_block_group_dup(cache, map, zone_info, active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: ret = btrfs_load_block_group_raid1(cache, map, zone_info, active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: ret = btrfs_load_block_group_raid0(cache, map, zone_info, active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: ret = btrfs_load_block_group_raid10(cache, map, zone_info, active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: default: btrfs_err(fs_info, "zoned: profile %s not yet supported", btrfs_bg_type_to_raid_name(map->type)); ret = -EINVAL; goto out; } if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && profile != BTRFS_BLOCK_GROUP_RAID10) { /* * Detected broken write pointer. 
Make this block group * unallocatable by setting the allocation pointer at the end of * allocatable region. Relocating this block group will fix the * mismatch. * * Currently, we cannot handle RAID0 or RAID10 case like this * because we don't have a proper zone_capacity value. But, * reading from this block group won't work anyway by a missing * stripe. */ cache->alloc_offset = cache->zone_capacity; } out: /* Reject non SINGLE data profiles without RST */ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); return -EINVAL; } if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", cache->alloc_offset, cache->zone_capacity, cache->start); ret = -EIO; } /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, "zoned: got wrong write pointer in BG %llu: %llu > %llu", logical, last_alloc, cache->alloc_offset); ret = -EIO; } if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { btrfs_get_block_group(cache); spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); } } else { btrfs_free_chunk_map(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); kfree(zone_info); return ret; } void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { u64 unusable, free; if (!btrfs_is_zoned(cache->fs_info)) return; WARN_ON(cache->bytes_super != 0); unusable = (cache->alloc_offset - cache->used) + (cache->length - cache->zone_capacity); free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; } bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_block_group *cache; bool ret = false; if (!btrfs_is_zoned(fs_info)) return false; if (!inode || !is_data_inode(inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) return false; /* * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a * time can add pages to an extent that gets relocated, so it's safe to * use regular REQ_OP_WRITE for this special case. 
*/ if (btrfs_is_data_reloc_root(inode->root)) return false; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) return false; ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; } void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; struct btrfs_ordered_sum *sum = bbio->sums; if (physical < bbio->orig_physical) sum->logical -= bbio->orig_physical - physical; else sum->logical += physical - bbio->orig_physical; } static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, u64 logical) { struct extent_map_tree *em_tree = &ordered->inode->extent_tree; struct extent_map *em; ordered->disk_bytenr = logical; write_lock(&em_tree->lock); em = btrfs_search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); /* The em should be a new COW extent, thus it should not have an offset. */ ASSERT(em->offset == 0); em->disk_bytenr = logical; btrfs_free_extent_map(em); write_unlock(&em_tree->lock); } static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, u64 logical, u64 len) { struct btrfs_ordered_extent *new; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && btrfs_split_extent_map(ordered->inode, ordered->file_offset, ordered->num_bytes, len, logical)) return false; new = btrfs_split_ordered_extent(ordered, len); if (IS_ERR(new)) return false; new->disk_bytenr = logical; btrfs_finish_one_ordered(new); return true; } void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *sum; u64 logical, len; /* * Write to pre-allocated region is for the data relocation, and so * it should use WRITE operation. No split/rewrite are necessary. */ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) return; ASSERT(!list_empty(&ordered->list)); /* The ordered->list can be empty in the above pre-alloc case. */ sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); logical = sum->logical; len = sum->len; while (len < ordered->disk_num_bytes) { sum = list_next_entry(sum, list); if (sum->logical == logical + len) { len += sum->len; continue; } if (!btrfs_zoned_split_ordered(ordered, logical, len)) { set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); btrfs_err(fs_info, "failed to split ordered extent"); goto out; } logical = sum->logical; len = sum->len; } if (ordered->disk_bytenr != logical) btrfs_rewrite_logical_zoned(ordered, logical); out: /* * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures * were allocated by btrfs_alloc_dummy_sum only to record the logical * addresses and don't contain actual checksums. We thus must free them * here so that we don't attempt to log the csums later. 
*/ if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) { while ((sum = list_first_entry_or_null(&ordered->list, typeof(*sum), list))) { list_del(&sum->list); kfree(sum); } } } static bool check_bg_is_active(struct btrfs_eb_write_context *ctx, struct btrfs_block_group **active_bg) { const struct writeback_control *wbc = ctx->wbc; struct btrfs_block_group *block_group = ctx->zoned_bg; struct btrfs_fs_info *fs_info = block_group->fs_info; if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) return true; if (fs_info->treelog_bg == block_group->start) { if (!btrfs_zone_activate(block_group)) { int ret_fin = btrfs_zone_finish_one_bg(fs_info); if (ret_fin != 1 || !btrfs_zone_activate(block_group)) return false; } } else if (*active_bg != block_group) { struct btrfs_block_group *tgt = *active_bg; /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */ lockdep_assert_held(&fs_info->zoned_meta_io_lock); if (tgt) { /* * If there is an unsent IO left in the allocated area, * we cannot wait for them as it may cause a deadlock. */ if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) { if (wbc->sync_mode == WB_SYNC_NONE || (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)) return false; } /* Pivot active metadata/system block group. */ btrfs_zoned_meta_io_unlock(fs_info); wait_eb_writebacks(tgt); do_zone_finish(tgt, true); btrfs_zoned_meta_io_lock(fs_info); if (*active_bg == tgt) { btrfs_put_block_group(tgt); *active_bg = NULL; } } if (!btrfs_zone_activate(block_group)) return false; if (*active_bg != block_group) { ASSERT(*active_bg == NULL); *active_bg = block_group; btrfs_get_block_group(block_group); } } return true; } /* * Check if @ctx->eb is aligned to the write pointer. * * Return: * 0: @ctx->eb is at the write pointer. You can write it. * -EAGAIN: There is a hole. The caller should handle the case. * -EBUSY: There is a hole, but the caller can just bail out. */ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct btrfs_eb_write_context *ctx) { const struct writeback_control *wbc = ctx->wbc; const struct extent_buffer *eb = ctx->eb; struct btrfs_block_group *block_group = ctx->zoned_bg; if (!btrfs_is_zoned(fs_info)) return 0; if (block_group) { if (block_group->start > eb->start || block_group->start + block_group->length <= eb->start) { btrfs_put_block_group(block_group); block_group = NULL; ctx->zoned_bg = NULL; } } if (!block_group) { block_group = btrfs_lookup_block_group(fs_info, eb->start); if (!block_group) return 0; ctx->zoned_bg = block_group; } if (block_group->meta_write_pointer == eb->start) { struct btrfs_block_group **tgt; if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) return 0; if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) tgt = &fs_info->active_system_bg; else tgt = &fs_info->active_meta_bg; if (check_bg_is_active(ctx, tgt)) return 0; } /* * Since we may release fs_info->zoned_meta_io_lock, someone can already * start writing this eb. In that case, we can just bail out. */ if (block_group->meta_write_pointer > eb->start) return -EBUSY; /* If for_sync, this hole will be filled with transaction commit. 
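	 * Returning -EBUSY is fine in that case; only a WB_SYNC_ALL writeback
	 * that is not for_sync gets -EAGAIN and has to handle the hole itself.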
*/ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; } int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) { if (!btrfs_dev_is_sequential(device, physical)) return -EOPNOTSUPP; return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, GFP_NOFS, 0); } static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, struct blk_zone *zone) { struct btrfs_io_context *bioc = NULL; u64 mapped_length = PAGE_SIZE; unsigned int nofs_flag; int nmirrors; int i, ret; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc, NULL, NULL); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; } if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EINVAL; goto out_put_bioc; } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; for (i = 0; i < nmirrors; i++) { u64 physical = bioc->stripes[i].physical; struct btrfs_device *dev = bioc->stripes[i].dev; /* Missing device */ if (!dev->bdev) continue; ret = btrfs_get_dev_zone(dev, physical, zone); /* Failing device */ if (ret == -EIO || ret == -EOPNOTSUPP) continue; break; } memalloc_nofs_restore(nofs_flag); out_put_bioc: btrfs_put_bioc(bioc); return ret; } /* * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by * filling zeros between @physical_pos to a write pointer of dev-replace * source device. */ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos) { struct btrfs_fs_info *fs_info = tgt_dev->fs_info; struct blk_zone zone; u64 length; u64 wp; int ret; if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) return 0; ret = read_zone_info(fs_info, logical, &zone); if (ret) return ret; wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); if (physical_pos == wp) return 0; if (physical_pos > wp) return -EUCLEAN; length = wp - physical_pos; return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } /* * Activate block group and underlying device zones * * @block_group: the block group to activate * * Return: true on success, false otherwise */ bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_chunk_map *map; struct btrfs_device *device; u64 physical; const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); bool ret; int i; if (!btrfs_is_zoned(block_group->fs_info)) return true; map = block_group->physical_map; spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; goto out_unlock; } if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { /* The caller should check if the block group is full. */ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) { ret = false; goto out_unlock; } } else { /* Since it is already written, it should have been active. */ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start); } for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; device = map->stripes[i].dev; physical = map->stripes[i].physical; zinfo = device->zone_info; if (!device->bdev) continue; if (zinfo->max_active_zones == 0) continue; if (is_data) reserved = zinfo->reserved_active_zones; /* * For the data block group, leave active zones for one * metadata block group and one system block group. 
*/ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); spin_unlock(&block_group->lock); /* For the active block group list */ btrfs_get_block_group(block_group); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); return true; out_unlock: spin_unlock(&block_group->lock); spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; struct extent_buffer *eb; unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); rcu_read_lock(); xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { if (eb->start < block_group->start) continue; if (eb->start >= end) break; rcu_read_unlock(); wait_on_extent_buffer_writeback(eb); rcu_read_lock(); } rcu_read_unlock(); } static int call_zone_finish(struct btrfs_block_group *block_group, struct btrfs_io_stripe *stripe) { struct btrfs_device *device = stripe->dev; const u64 physical = stripe->physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; int ret; if (!device->bdev) return 0; if (zinfo->max_active_zones == 0) return 0; if (btrfs_dev_is_sequential(device, physical)) { unsigned int nofs_flags; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, zinfo->zone_size >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); if (ret) return ret; } if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); return 0; } static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i; spin_lock(&block_group->lock); if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } /* Check if we have unwritten allocated space */ if (is_metadata && block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; } /* * If we are sure that the block group is full (= no more room left for * new allocation) and the IO for the last usable block is completed, we * don't need to wait for the other IOs. This holds because we ensure * the sequential IO submissions using the ZONE_APPEND command for data * and block_group->meta_write_pointer for metadata. */ if (!fully_written) { if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return -EAGAIN; } spin_unlock(&block_group->lock); ret = btrfs_inc_block_group_ro(block_group, false); if (ret) return ret; /* Ensure all writes in this block group finish */ btrfs_wait_block_group_reservations(block_group); /* No need to wait for NOCOW writers. 
Zoned mode does not allow that */ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group); /* Wait for extent buffers to be written. */ if (is_metadata) wait_eb_writebacks(block_group); spin_lock(&block_group->lock); /* * Bail out if someone already deactivated the block group, or * allocated space is left in the block group. */ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return 0; } if (block_group->reserved || test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return -EAGAIN; } } clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) block_group->meta_write_pointer = block_group->start + block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { ret = call_zone_finish(block_group, &map->stripes[i]); if (ret) { up_read(&dev_replace->rwsem); return ret; } } up_read(&dev_replace->rwsem); if (!fully_written) btrfs_dec_block_group_ro(block_group); spin_lock(&fs_info->zone_active_bgs_lock); ASSERT(!list_empty(&block_group->active_bg_list)); list_del_init(&block_group->active_bg_list); spin_unlock(&fs_info->zone_active_bgs_lock); /* For active_bg_list */ btrfs_put_block_group(block_group); clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return 0; } int btrfs_zone_finish(struct btrfs_block_group *block_group) { if (!btrfs_is_zoned(block_group->fs_info)) return 0; return do_zone_finish(block_group, false); } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; if (!btrfs_is_zoned(fs_info)) return true; if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) return false; /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; int reserved = 0; if (!device->bdev) continue; if (!zinfo->max_active_zones) { ret = true; break; } if (flags & BTRFS_BLOCK_GROUP_DATA) reserved = zinfo->reserved_active_zones; switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); break; case BTRFS_BLOCK_GROUP_DUP: ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); break; } if (ret) break; } spin_unlock(&fs_info->zone_active_bgs_lock); mutex_unlock(&fs_info->chunk_mutex); if (!ret) set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return ret; } int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return 0; block_group = btrfs_lookup_block_group(fs_info, logical); if (WARN_ON_ONCE(!block_group)) return -ENOENT; /* No MIXED_BG on zoned btrfs. 
*/ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) min_alloc_bytes = fs_info->sectorsize; else min_alloc_bytes = fs_info->nodesize; /* Bail out if we can allocate more data from this block group. */ if (logical + length + min_alloc_bytes <= block_group->start + block_group->zone_capacity) goto out; do_zone_finish(block_group, true); out: btrfs_put_block_group(block_group); return 0; } static void btrfs_zone_finish_endio_workfn(struct work_struct *work) { int ret; struct btrfs_block_group *bg = container_of(work, struct btrfs_block_group, zone_finish_work); wait_on_extent_buffer_writeback(bg->last_eb); free_extent_buffer(bg->last_eb); ret = do_zone_finish(bg, true); if (ret) btrfs_handle_fs_error(bg->fs_info, ret, "Failed to finish block-group's zone"); btrfs_put_block_group(bg); } void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) return; if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", bg->start); return; } /* For the work */ btrfs_get_block_group(bg); refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg == bg->start) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); } void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; struct btrfs_space_info *space_info = data_sinfo; struct btrfs_trans_handle *trans; struct btrfs_block_group *bg; struct list_head *bg_list; u64 alloc_flags; bool first = true; bool did_chunk_alloc = false; int index; int ret; if (!btrfs_is_zoned(fs_info)) return; if (fs_info->data_reloc_bg) return; if (sb_rdonly(fs_info->sb)) return; alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); index = btrfs_bg_flags_to_raid_index(alloc_flags); /* Scan the data space_info to find empty block groups. Take the second one. */ again: bg_list = &space_info->block_groups[index]; list_for_each_entry(bg, bg_list, list) { if (bg->alloc_offset != 0) continue; if (first) { first = false; continue; } if (space_info == data_sinfo) { /* Migrate the block group to the data relocation space_info. */ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0]; int factor; ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); factor = btrfs_bg_type_to_factor(bg->flags); down_write(&space_info->groups_sem); list_del_init(&bg->list); /* We can assume this as we choose the second empty one. */ ASSERT(!list_empty(&space_info->block_groups[index])); up_write(&space_info->groups_sem); spin_lock(&space_info->lock); space_info->total_bytes -= bg->length; space_info->disk_total -= bg->length * factor; /* There is no allocation ever happened. */ ASSERT(bg->used == 0); ASSERT(bg->zone_unusable == 0); /* No super block in a block group on the zoned setup. 
*/ ASSERT(bg->bytes_super == 0); spin_unlock(&space_info->lock); bg->space_info = reloc_sinfo; if (reloc_sinfo->block_group_kobjs[index] == NULL) btrfs_sysfs_add_block_group_type(bg); btrfs_add_bg_to_space_info(fs_info, bg); } fs_info->data_reloc_bg = bg->start; set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags); btrfs_zone_activate(bg); return; } if (did_chunk_alloc) return; trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return; /* Allocate new BG in the data relocation space_info. */ space_info = data_sinfo->sub_group[0]; ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret == 1) { /* * We allocated a new block group in the data relocation space_info. We * can take that one. */ first = false; did_chunk_alloc = true; goto again; } } void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; if (!btrfs_is_zoned(fs_info)) return; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->zone_info) { vfree(device->zone_info->zone_cache); device->zone_info->zone_cache = NULL; } } mutex_unlock(&fs_devices->device_list_mutex); } bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 total = btrfs_super_total_bytes(fs_info->super_copy); u64 used = 0; u64 factor; ASSERT(btrfs_is_zoned(fs_info)); if (fs_info->bg_reclaim_threshold == 0) return false; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (!device->bdev) continue; used += device->bytes_used; } mutex_unlock(&fs_devices->device_list_mutex); factor = div64_u64(used * 100, total); return factor >= fs_info->bg_reclaim_threshold; } void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; if (!btrfs_is_zoned(fs_info)) return; block_group = btrfs_lookup_block_group(fs_info, logical); /* It should be called on a previous data relocation block group. */ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); spin_lock(&block_group->lock); if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) goto out; /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { /* * Now, release this block group for further allocations and * zone finish. 
*/ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } out: spin_unlock(&block_group->lock); btrfs_put_block_group(block_group); } int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) { struct btrfs_block_group *block_group; struct btrfs_block_group *min_bg = NULL; u64 min_avail = U64_MAX; int ret; spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { u64 avail; spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) || test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; } avail = block_group->zone_capacity - block_group->alloc_offset; if (min_avail > avail) { if (min_bg) btrfs_put_block_group(min_bg); min_bg = block_group; min_avail = avail; btrfs_get_block_group(min_bg); } spin_unlock(&block_group->lock); } spin_unlock(&fs_info->zone_active_bgs_lock); if (!min_bg) return 0; ret = btrfs_zone_finish(min_bg); btrfs_put_block_group(min_bg); return ret < 0 ? ret : 1; } int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish) { struct btrfs_block_group *bg; int index; if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) return 0; for (;;) { int ret; bool need_finish = false; down_read(&space_info->groups_sem); for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { list_for_each_entry(bg, &space_info->block_groups[index], list) { if (!spin_trylock(&bg->lock)) continue; if (btrfs_zoned_bg_is_full(bg) || test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags)) { spin_unlock(&bg->lock); continue; } spin_unlock(&bg->lock); if (btrfs_zone_activate(bg)) { up_read(&space_info->groups_sem); return 1; } need_finish = true; } } up_read(&space_info->groups_sem); if (!do_finish || !need_finish) break; ret = btrfs_zone_finish_one_bg(fs_info); if (ret == 0) break; if (ret < 0) return ret; } return 0; } /* * Reserve zones for one metadata block group, one tree-log block group, and one * system block group. */ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *block_group; struct btrfs_device *device; /* Reserve zones for normal SINGLE metadata and tree-log block group. */ unsigned int metadata_reserve = 2; /* Reserve a zone for SINGLE system block group. */ unsigned int system_reserve = 1; if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) return; /* * This function is called from the mount context. So, there is no * parallel process touching the bits. No need for read_seqretry(). */ if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) metadata_reserve = 4; if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) system_reserve = 2; /* Apply the reservation on all the devices. */ mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (!device->bdev) continue; device->zone_info->reserved_active_zones = metadata_reserve + system_reserve; } mutex_unlock(&fs_devices->device_list_mutex); /* Release reservation for currently active block groups. 
*/ spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { struct btrfs_chunk_map *map = block_group->physical_map; if (!(block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) continue; for (int i = 0; i < map->num_stripes; i++) map->stripes[i].dev->zone_info->reserved_active_zones--; } spin_unlock(&fs_info->zone_active_bgs_lock); } /* * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. * * @space_info: the space to work on * @num_bytes: targeting reclaim bytes * * This one resets the zones of a block group, so we can reuse the region * without removing the block group. On the other hand, btrfs_delete_unused_bgs() * just removes a block group and frees up the underlying zones. So, we still * need to allocate a new block group to reuse the zones. * * Resetting is faster than deleting/recreating a block group. It is similar * to freeing the logical space on the regular mode. However, we cannot change * the block group's profile with this operation. */ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) { struct btrfs_fs_info *fs_info = space_info->fs_info; const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; if (!btrfs_is_zoned(fs_info)) return 0; while (num_bytes > 0) { struct btrfs_chunk_map *map; struct btrfs_block_group *bg = NULL; bool found = false; u64 reclaimed = 0; /* * Here, we choose a fully zone_unusable block group. It's * technically possible to reset a partly zone_unusable block * group, which still has some free space left. However, * handling that needs to cope with the allocation side, which * makes the logic more complex. So, let's handle the easy case * for now. */ spin_lock(&fs_info->unused_bgs_lock); list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) continue; /* * Use trylock to avoid locking order violation. In * btrfs_reclaim_bgs_work(), the lock order is * &bg->lock -> &fs_info->unused_bgs_lock. We skip a * block group if we cannot take its lock. */ if (!spin_trylock(&bg->lock)) continue; if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { spin_unlock(&bg->lock); continue; } spin_unlock(&bg->lock); found = true; break; } if (!found) { spin_unlock(&fs_info->unused_bgs_lock); return 0; } list_del_init(&bg->bg_list); btrfs_put_block_group(bg); spin_unlock(&fs_info->unused_bgs_lock); /* * Since the block group is fully zone_unusable and we cannot * allocate from this block group anymore, we don't need to set * this block group read-only. */ down_read(&fs_info->dev_replace.rwsem); map = bg->physical_map; for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; unsigned int nofs_flags; int ret; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, stripe->physical >> SECTOR_SHIFT, zone_size_sectors); memalloc_nofs_restore(nofs_flags); if (ret) { up_read(&fs_info->dev_replace.rwsem); return ret; } } up_read(&fs_info->dev_replace.rwsem); spin_lock(&space_info->lock); spin_lock(&bg->lock); ASSERT(!btrfs_is_block_group_used(bg)); if (bg->ro) { spin_unlock(&bg->lock); spin_unlock(&space_info->lock); continue; } reclaimed = bg->alloc_offset; bg->zone_unusable = bg->length - bg->zone_capacity; bg->alloc_offset = 0; /* * This holds because we currently reset fully used then freed * block group. 
*/ ASSERT(reclaimed == bg->zone_capacity); bg->free_space_ctl->free_space += reclaimed; space_info->bytes_zone_unusable -= reclaimed; spin_unlock(&bg->lock); btrfs_return_free_space(space_info, reclaimed); spin_unlock(&space_info->lock); if (num_bytes <= reclaimed) break; num_bytes -= reclaimed; } return 0; }
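The zone reset issued above through blkdev_zone_mgmt(REQ_OP_ZONE_RESET, ...) has a direct userspace counterpart: the BLKRESETZONE ioctl from <linux/blkzoned.h>, which rewinds a zone's write pointer in the same way. The following is an editor's sketch of that device-level operation, not part of the kernel code above; the function name is invented for illustration, and a real caller must pass zone-aligned sector values for the target device.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

/* Reset one zone: rewind its write pointer, discarding the zone's contents. */
static int reset_one_zone(const char *dev, __u64 zone_sector, __u64 zone_nr_sectors)
{
	struct blk_zone_range range = {
		.sector = zone_sector,		/* must be the start of a zone */
		.nr_sectors = zone_nr_sectors,	/* typically exactly one zone */
	};
	int ret = 0;
	int fd = open(dev, O_RDWR);

	if (fd < 0) {
		perror("open");
		return -1;
	}
	if (ioctl(fd, BLKRESETZONE, &range) < 0) {
		perror("BLKRESETZONE");
		ret = -1;
	}
	close(fd);
	return ret;
}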
// SPDX-License-Identifier: GPL-2.0-only /* * Frontend driver for mobile DVB-T demodulator DiBcom 3000M-B * DiBcom (http://www.dibcom.fr/) * * Copyright (C) 2004-5 Patrick Boettcher (patrick.boettcher@posteo.de) * * based on GPL code from DibCom, which has * * Copyright (C) 2004 Amaury Demol for DiBcom * * Acknowledgements * * Amaury Demol from DiBcom for providing specs
and driver * sources, on which this driver (and the dvb-dibusb) are based. * * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/slab.h> #include <media/dvb_frontend.h> #include "dib3000.h" #include "dib3000mb_priv.h" /* Version information */ #define DRIVER_VERSION "0.1" #define DRIVER_DESC "DiBcom 3000M-B DVB-T demodulator" #define DRIVER_AUTHOR "Patrick Boettcher, patrick.boettcher@posteo.de" static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "set debugging level (1=info,2=xfer,4=setfe,8=getfe (|-able))."); #define deb_info(args...) dprintk(0x01, args) #define deb_i2c(args...) dprintk(0x02, args) #define deb_srch(args...) dprintk(0x04, args) #define deb_xfer(args...) dprintk(0x02, args) #define deb_setf(args...) dprintk(0x04, args) #define deb_getf(args...) dprintk(0x08, args) static int dib3000_read_reg(struct dib3000_state *state, u16 reg) { u8 wb[] = { ((reg >> 8) | 0x80) & 0xff, reg & 0xff }; u8 rb[2] = {}; struct i2c_msg msg[] = { { .addr = state->config.demod_address, .flags = 0, .buf = wb, .len = 2 }, { .addr = state->config.demod_address, .flags = I2C_M_RD, .buf = rb, .len = 2 }, }; if (i2c_transfer(state->i2c, msg, 2) != 2) deb_i2c("i2c read error\n"); deb_i2c("reading i2c bus (reg: %5d 0x%04x, val: %5d 0x%04x)\n",reg,reg, (rb[0] << 8) | rb[1],(rb[0] << 8) | rb[1]); return (rb[0] << 8) | rb[1]; } static int dib3000_write_reg(struct dib3000_state *state, u16 reg, u16 val) { u8 b[] = { (reg >> 8) & 0xff, reg & 0xff, (val >> 8) & 0xff, val & 0xff, }; struct i2c_msg msg[] = { { .addr = state->config.demod_address, .flags = 0, .buf = b, .len = 4 } }; deb_i2c("writing i2c bus (reg: %5d 0x%04x, val: %5d 0x%04x)\n",reg,reg,val,val); return i2c_transfer(state->i2c,msg, 1) != 1 ?
-EREMOTEIO : 0; } static int dib3000_search_status(u16 irq,u16 lock) { if (irq & 0x02) { if (lock & 0x01) { deb_srch("auto search succeeded\n"); return 1; // auto search succeeded } else { deb_srch("auto search not successful\n"); return 0; // auto search failed } } else if (irq & 0x01) { deb_srch("auto search failed\n"); return 0; // auto search failed } return -1; // try again } /* for auto search */ static u16 dib3000_seq[2][2][2] = /* fft,gua, inv */ { /* fft */ { /* gua */ { 0, 1 }, /* 0 0 { 0,1 } */ { 3, 9 }, /* 0 1 { 0,1 } */ }, { { 2, 5 }, /* 1 0 { 0,1 } */ { 6, 11 }, /* 1 1 { 0,1 } */ } }; static int dib3000mb_get_frontend(struct dvb_frontend* fe, struct dtv_frontend_properties *c); static int dib3000mb_set_frontend(struct dvb_frontend *fe, int tuner) { struct dib3000_state* state = fe->demodulator_priv; struct dtv_frontend_properties *c = &fe->dtv_property_cache; enum fe_code_rate fe_cr = FEC_NONE; int search_state, seq; if (tuner && fe->ops.tuner_ops.set_params) { fe->ops.tuner_ops.set_params(fe); if (fe->ops.i2c_gate_ctrl) fe->ops.i2c_gate_ctrl(fe, 0); switch (c->bandwidth_hz) { case 8000000: wr_foreach(dib3000mb_reg_timing_freq, dib3000mb_timing_freq[2]); wr_foreach(dib3000mb_reg_bandwidth, dib3000mb_bandwidth_8mhz); break; case 7000000: wr_foreach(dib3000mb_reg_timing_freq, dib3000mb_timing_freq[1]); wr_foreach(dib3000mb_reg_bandwidth, dib3000mb_bandwidth_7mhz); break; case 6000000: wr_foreach(dib3000mb_reg_timing_freq, dib3000mb_timing_freq[0]); wr_foreach(dib3000mb_reg_bandwidth, dib3000mb_bandwidth_6mhz); break; case 0: return -EOPNOTSUPP; default: pr_err("unknown bandwidth value.\n"); return -EINVAL; } deb_setf("bandwidth: %d MHz\n", c->bandwidth_hz / 1000000); } wr(DIB3000MB_REG_LOCK1_MASK, DIB3000MB_LOCK1_SEARCH_4); switch (c->transmission_mode) { case TRANSMISSION_MODE_2K: deb_setf("transmission mode: 2k\n"); wr(DIB3000MB_REG_FFT, DIB3000_TRANSMISSION_MODE_2K); break; case TRANSMISSION_MODE_8K: deb_setf("transmission mode: 8k\n"); wr(DIB3000MB_REG_FFT, DIB3000_TRANSMISSION_MODE_8K); break; case TRANSMISSION_MODE_AUTO: deb_setf("transmission mode: auto\n"); break; default: return -EINVAL; } switch (c->guard_interval) { case GUARD_INTERVAL_1_32: deb_setf("guard 1_32\n"); wr(DIB3000MB_REG_GUARD_TIME, DIB3000_GUARD_TIME_1_32); break; case GUARD_INTERVAL_1_16: deb_setf("guard 1_16\n"); wr(DIB3000MB_REG_GUARD_TIME, DIB3000_GUARD_TIME_1_16); break; case GUARD_INTERVAL_1_8: deb_setf("guard 1_8\n"); wr(DIB3000MB_REG_GUARD_TIME, DIB3000_GUARD_TIME_1_8); break; case GUARD_INTERVAL_1_4: deb_setf("guard 1_4\n"); wr(DIB3000MB_REG_GUARD_TIME, DIB3000_GUARD_TIME_1_4); break; case GUARD_INTERVAL_AUTO: deb_setf("guard auto\n"); break; default: return -EINVAL; } switch (c->inversion) { case INVERSION_OFF: deb_setf("inversion off\n"); wr(DIB3000MB_REG_DDS_INV, DIB3000_DDS_INVERSION_OFF); break; case INVERSION_AUTO: deb_setf("inversion auto\n"); break; case INVERSION_ON: deb_setf("inversion on\n"); wr(DIB3000MB_REG_DDS_INV, DIB3000_DDS_INVERSION_ON); break; default: return -EINVAL; } switch (c->modulation) { case QPSK: deb_setf("modulation: qpsk\n"); wr(DIB3000MB_REG_QAM, DIB3000_CONSTELLATION_QPSK); break; case QAM_16: deb_setf("modulation: qam16\n"); wr(DIB3000MB_REG_QAM, DIB3000_CONSTELLATION_16QAM); break; case QAM_64: deb_setf("modulation: qam64\n"); wr(DIB3000MB_REG_QAM, DIB3000_CONSTELLATION_64QAM); break; case QAM_AUTO: break; default: return -EINVAL; } switch (c->hierarchy) { case HIERARCHY_NONE: deb_setf("hierarchy: none\n"); fallthrough; case HIERARCHY_1: deb_setf("hierarchy:
alpha=1\n"); wr(DIB3000MB_REG_VIT_ALPHA, DIB3000_ALPHA_1); break; case HIERARCHY_2: deb_setf("hierarchy: alpha=2\n"); wr(DIB3000MB_REG_VIT_ALPHA, DIB3000_ALPHA_2); break; case HIERARCHY_4: deb_setf("hierarchy: alpha=4\n"); wr(DIB3000MB_REG_VIT_ALPHA, DIB3000_ALPHA_4); break; case HIERARCHY_AUTO: deb_setf("hierarchy: alpha=auto\n"); break; default: return -EINVAL; } if (c->hierarchy == HIERARCHY_NONE) { wr(DIB3000MB_REG_VIT_HRCH, DIB3000_HRCH_OFF); wr(DIB3000MB_REG_VIT_HP, DIB3000_SELECT_HP); fe_cr = c->code_rate_HP; } else if (c->hierarchy != HIERARCHY_AUTO) { wr(DIB3000MB_REG_VIT_HRCH, DIB3000_HRCH_ON); wr(DIB3000MB_REG_VIT_HP, DIB3000_SELECT_LP); fe_cr = c->code_rate_LP; } switch (fe_cr) { case FEC_1_2: deb_setf("fec: 1_2\n"); wr(DIB3000MB_REG_VIT_CODE_RATE, DIB3000_FEC_1_2); break; case FEC_2_3: deb_setf("fec: 2_3\n"); wr(DIB3000MB_REG_VIT_CODE_RATE, DIB3000_FEC_2_3); break; case FEC_3_4: deb_setf("fec: 3_4\n"); wr(DIB3000MB_REG_VIT_CODE_RATE, DIB3000_FEC_3_4); break; case FEC_5_6: deb_setf("fec: 5_6\n"); wr(DIB3000MB_REG_VIT_CODE_RATE, DIB3000_FEC_5_6); break; case FEC_7_8: deb_setf("fec: 7_8\n"); wr(DIB3000MB_REG_VIT_CODE_RATE, DIB3000_FEC_7_8); break; case FEC_NONE: deb_setf("fec: none\n"); break; case FEC_AUTO: deb_setf("fec: auto\n"); break; default: return -EINVAL; } seq = dib3000_seq [c->transmission_mode == TRANSMISSION_MODE_AUTO] [c->guard_interval == GUARD_INTERVAL_AUTO] [c->inversion == INVERSION_AUTO]; deb_setf("seq? %d\n", seq); wr(DIB3000MB_REG_SEQ, seq); wr(DIB3000MB_REG_ISI, seq ? DIB3000MB_ISI_INHIBIT : DIB3000MB_ISI_ACTIVATE); if (c->transmission_mode == TRANSMISSION_MODE_2K) { if (c->guard_interval == GUARD_INTERVAL_1_8) { wr(DIB3000MB_REG_SYNC_IMPROVEMENT, DIB3000MB_SYNC_IMPROVE_2K_1_8); } else { wr(DIB3000MB_REG_SYNC_IMPROVEMENT, DIB3000MB_SYNC_IMPROVE_DEFAULT); } wr(DIB3000MB_REG_UNK_121, DIB3000MB_UNK_121_2K); } else { wr(DIB3000MB_REG_UNK_121, DIB3000MB_UNK_121_DEFAULT); } wr(DIB3000MB_REG_MOBILE_ALGO, DIB3000MB_MOBILE_ALGO_OFF); wr(DIB3000MB_REG_MOBILE_MODE_QAM, DIB3000MB_MOBILE_MODE_QAM_OFF); wr(DIB3000MB_REG_MOBILE_MODE, DIB3000MB_MOBILE_MODE_OFF); wr_foreach(dib3000mb_reg_agc_bandwidth, dib3000mb_agc_bandwidth_high); wr(DIB3000MB_REG_ISI, DIB3000MB_ISI_ACTIVATE); wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_AGC + DIB3000MB_RESTART_CTRL); wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_OFF); /* wait for AGC lock */ msleep(70); wr_foreach(dib3000mb_reg_agc_bandwidth, dib3000mb_agc_bandwidth_low); /* something has to be auto searched */ if (c->modulation == QAM_AUTO || c->hierarchy == HIERARCHY_AUTO || fe_cr == FEC_AUTO || c->inversion == INVERSION_AUTO) { int as_count=0; deb_setf("autosearch enabled.\n"); wr(DIB3000MB_REG_ISI, DIB3000MB_ISI_INHIBIT); wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_AUTO_SEARCH); wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_OFF); while ((search_state = dib3000_search_status( rd(DIB3000MB_REG_AS_IRQ_PENDING), rd(DIB3000MB_REG_LOCK2_VALUE))) < 0 && as_count++ < 100) msleep(1); deb_setf("search_state after autosearch %d after %d checks\n", search_state, as_count); if (search_state == 1) { if (dib3000mb_get_frontend(fe, c) == 0) { deb_setf("reading tuning data from frontend succeeded.\n"); return dib3000mb_set_frontend(fe, 0); } } } else { wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_CTRL); wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_OFF); } return 0; } static int dib3000mb_fe_init(struct dvb_frontend* fe, int mobile_mode) { struct dib3000_state* state = fe->demodulator_priv; deb_info("dib3000mb is getting up.\n"); 
wr(DIB3000MB_REG_POWER_CONTROL, DIB3000MB_POWER_UP); wr(DIB3000MB_REG_RESTART, DIB3000MB_RESTART_AGC); wr(DIB3000MB_REG_RESET_DEVICE, DIB3000MB_RESET_DEVICE); wr(DIB3000MB_REG_RESET_DEVICE, DIB3000MB_RESET_DEVICE_RST); wr(DIB3000MB_REG_CLOCK, DIB3000MB_CLOCK_DEFAULT); wr(DIB3000MB_REG_ELECT_OUT_MODE, DIB3000MB_ELECT_OUT_MODE_ON); wr(DIB3000MB_REG_DDS_FREQ_MSB, DIB3000MB_DDS_FREQ_MSB); wr(DIB3000MB_REG_DDS_FREQ_LSB, DIB3000MB_DDS_FREQ_LSB); wr_foreach(dib3000mb_reg_timing_freq, dib3000mb_timing_freq[2]); wr_foreach(dib3000mb_reg_impulse_noise, dib3000mb_impulse_noise_values[DIB3000MB_IMPNOISE_OFF]); wr_foreach(dib3000mb_reg_agc_gain, dib3000mb_default_agc_gain); wr(DIB3000MB_REG_PHASE_NOISE, DIB3000MB_PHASE_NOISE_DEFAULT); wr_foreach(dib3000mb_reg_phase_noise, dib3000mb_default_noise_phase); wr_foreach(dib3000mb_reg_lock_duration, dib3000mb_default_lock_duration); wr_foreach(dib3000mb_reg_agc_bandwidth, dib3000mb_agc_bandwidth_low); wr(DIB3000MB_REG_LOCK0_MASK, DIB3000MB_LOCK0_DEFAULT); wr(DIB3000MB_REG_LOCK1_MASK, DIB3000MB_LOCK1_SEARCH_4); wr(DIB3000MB_REG_LOCK2_MASK, DIB3000MB_LOCK2_DEFAULT); wr(DIB3000MB_REG_SEQ, dib3000_seq[1][1][1]); wr_foreach(dib3000mb_reg_bandwidth, dib3000mb_bandwidth_8mhz); wr(DIB3000MB_REG_UNK_68, DIB3000MB_UNK_68); wr(DIB3000MB_REG_UNK_69, DIB3000MB_UNK_69); wr(DIB3000MB_REG_UNK_71, DIB3000MB_UNK_71); wr(DIB3000MB_REG_UNK_77, DIB3000MB_UNK_77); wr(DIB3000MB_REG_UNK_78, DIB3000MB_UNK_78); wr(DIB3000MB_REG_ISI, DIB3000MB_ISI_INHIBIT); wr(DIB3000MB_REG_UNK_92, DIB3000MB_UNK_92); wr(DIB3000MB_REG_UNK_96, DIB3000MB_UNK_96); wr(DIB3000MB_REG_UNK_97, DIB3000MB_UNK_97); wr(DIB3000MB_REG_UNK_106, DIB3000MB_UNK_106); wr(DIB3000MB_REG_UNK_107, DIB3000MB_UNK_107); wr(DIB3000MB_REG_UNK_108, DIB3000MB_UNK_108); wr(DIB3000MB_REG_UNK_122, DIB3000MB_UNK_122); wr(DIB3000MB_REG_MOBILE_MODE_QAM, DIB3000MB_MOBILE_MODE_QAM_OFF); wr(DIB3000MB_REG_BERLEN, DIB3000MB_BERLEN_DEFAULT); wr_foreach(dib3000mb_reg_filter_coeffs, dib3000mb_filter_coeffs); wr(DIB3000MB_REG_MOBILE_ALGO, DIB3000MB_MOBILE_ALGO_ON); wr(DIB3000MB_REG_MULTI_DEMOD_MSB, DIB3000MB_MULTI_DEMOD_MSB); wr(DIB3000MB_REG_MULTI_DEMOD_LSB, DIB3000MB_MULTI_DEMOD_LSB); wr(DIB3000MB_REG_OUTPUT_MODE, DIB3000MB_OUTPUT_MODE_SLAVE); wr(DIB3000MB_REG_FIFO_142, DIB3000MB_FIFO_142); wr(DIB3000MB_REG_MPEG2_OUT_MODE, DIB3000MB_MPEG2_OUT_MODE_188); wr(DIB3000MB_REG_PID_PARSE, DIB3000MB_PID_PARSE_ACTIVATE); wr(DIB3000MB_REG_FIFO, DIB3000MB_FIFO_INHIBIT); wr(DIB3000MB_REG_FIFO_146, DIB3000MB_FIFO_146); wr(DIB3000MB_REG_FIFO_147, DIB3000MB_FIFO_147); wr(DIB3000MB_REG_DATA_IN_DIVERSITY, DIB3000MB_DATA_DIVERSITY_IN_OFF); return 0; } static int dib3000mb_get_frontend(struct dvb_frontend* fe, struct dtv_frontend_properties *c) { struct dib3000_state* state = fe->demodulator_priv; enum fe_code_rate *cr; u16 tps_val; int inv_test1,inv_test2; u32 dds_val, threshold = 0x800000; if (!rd(DIB3000MB_REG_TPS_LOCK)) return 0; dds_val = ((rd(DIB3000MB_REG_DDS_VALUE_MSB) & 0xff) << 16) + rd(DIB3000MB_REG_DDS_VALUE_LSB); deb_getf("DDS_VAL: %x %x %x\n", dds_val, rd(DIB3000MB_REG_DDS_VALUE_MSB), rd(DIB3000MB_REG_DDS_VALUE_LSB)); if (dds_val < threshold) inv_test1 = 0; else if (dds_val == threshold) inv_test1 = 1; else inv_test1 = 2; dds_val = ((rd(DIB3000MB_REG_DDS_FREQ_MSB) & 0xff) << 16) + rd(DIB3000MB_REG_DDS_FREQ_LSB); deb_getf("DDS_FREQ: %x %x %x\n", dds_val, rd(DIB3000MB_REG_DDS_FREQ_MSB), rd(DIB3000MB_REG_DDS_FREQ_LSB)); if (dds_val < threshold) inv_test2 = 0; else if (dds_val == threshold) inv_test2 = 1; else inv_test2 = 2; c->inversion = ((inv_test2 == 
2) && (inv_test1==1 || inv_test1==0)) || ((inv_test2 == 0) && (inv_test1==1 || inv_test1==2)) ? INVERSION_ON : INVERSION_OFF; deb_getf("inversion %d %d, %d\n", inv_test2, inv_test1, c->inversion); switch ((tps_val = rd(DIB3000MB_REG_TPS_QAM))) { case DIB3000_CONSTELLATION_QPSK: deb_getf("QPSK\n"); c->modulation = QPSK; break; case DIB3000_CONSTELLATION_16QAM: deb_getf("QAM16\n"); c->modulation = QAM_16; break; case DIB3000_CONSTELLATION_64QAM: deb_getf("QAM64\n"); c->modulation = QAM_64; break; default: pr_err("Unexpected constellation returned by TPS (%d)\n", tps_val); break; } deb_getf("TPS: %d\n", tps_val); if (rd(DIB3000MB_REG_TPS_HRCH)) { deb_getf("HRCH ON\n"); cr = &c->code_rate_LP; c->code_rate_HP = FEC_NONE; switch ((tps_val = rd(DIB3000MB_REG_TPS_VIT_ALPHA))) { case DIB3000_ALPHA_0: deb_getf("HIERARCHY_NONE\n"); c->hierarchy = HIERARCHY_NONE; break; case DIB3000_ALPHA_1: deb_getf("HIERARCHY_1\n"); c->hierarchy = HIERARCHY_1; break; case DIB3000_ALPHA_2: deb_getf("HIERARCHY_2\n"); c->hierarchy = HIERARCHY_2; break; case DIB3000_ALPHA_4: deb_getf("HIERARCHY_4\n"); c->hierarchy = HIERARCHY_4; break; default: pr_err("Unexpected ALPHA value returned by TPS (%d)\n", tps_val); break; } deb_getf("TPS: %d\n", tps_val); tps_val = rd(DIB3000MB_REG_TPS_CODE_RATE_LP); } else { deb_getf("HRCH OFF\n"); cr = &c->code_rate_HP; c->code_rate_LP = FEC_NONE; c->hierarchy = HIERARCHY_NONE; tps_val = rd(DIB3000MB_REG_TPS_CODE_RATE_HP); } switch (tps_val) { case DIB3000_FEC_1_2: deb_getf("FEC_1_2\n"); *cr = FEC_1_2; break; case DIB3000_FEC_2_3: deb_getf("FEC_2_3\n"); *cr = FEC_2_3; break; case DIB3000_FEC_3_4: deb_getf("FEC_3_4\n"); *cr = FEC_3_4; break; case DIB3000_FEC_5_6: deb_getf("FEC_5_6\n"); *cr = FEC_5_6; break; case DIB3000_FEC_7_8: deb_getf("FEC_7_8\n"); *cr = FEC_7_8; break; default: pr_err("Unexpected FEC returned by TPS (%d)\n", tps_val); break; } deb_getf("TPS: %d\n",tps_val); switch ((tps_val = rd(DIB3000MB_REG_TPS_GUARD_TIME))) { case DIB3000_GUARD_TIME_1_32: deb_getf("GUARD_INTERVAL_1_32\n"); c->guard_interval = GUARD_INTERVAL_1_32; break; case DIB3000_GUARD_TIME_1_16: deb_getf("GUARD_INTERVAL_1_16\n"); c->guard_interval = GUARD_INTERVAL_1_16; break; case DIB3000_GUARD_TIME_1_8: deb_getf("GUARD_INTERVAL_1_8\n"); c->guard_interval = GUARD_INTERVAL_1_8; break; case DIB3000_GUARD_TIME_1_4: deb_getf("GUARD_INTERVAL_1_4\n"); c->guard_interval = GUARD_INTERVAL_1_4; break; default: pr_err("Unexpected Guard Time returned by TPS (%d)\n", tps_val); break; } deb_getf("TPS: %d\n", tps_val); switch ((tps_val = rd(DIB3000MB_REG_TPS_FFT))) { case DIB3000_TRANSMISSION_MODE_2K: deb_getf("TRANSMISSION_MODE_2K\n"); c->transmission_mode = TRANSMISSION_MODE_2K; break; case DIB3000_TRANSMISSION_MODE_8K: deb_getf("TRANSMISSION_MODE_8K\n"); c->transmission_mode = TRANSMISSION_MODE_8K; break; default: pr_err("Unexpected transmission mode returned by TPS (%d)\n", tps_val); break; } deb_getf("TPS: %d\n", tps_val); return 0; } static int dib3000mb_read_status(struct dvb_frontend *fe, enum fe_status *stat) { struct dib3000_state* state = fe->demodulator_priv; *stat = 0; if (rd(DIB3000MB_REG_AGC_LOCK)) *stat |= FE_HAS_SIGNAL; if (rd(DIB3000MB_REG_CARRIER_LOCK)) *stat |= FE_HAS_CARRIER; if (rd(DIB3000MB_REG_VIT_LCK)) *stat |= FE_HAS_VITERBI; if (rd(DIB3000MB_REG_TS_SYNC_LOCK)) *stat |= (FE_HAS_SYNC | FE_HAS_LOCK); deb_getf("actual status is %2x\n",*stat); deb_getf("autoval: tps: %d, qam: %d, hrch: %d, alpha: %d, hp: %d, lp: %d, guard: %d, fft: %d cell: %d\n", rd(DIB3000MB_REG_TPS_LOCK), rd(DIB3000MB_REG_TPS_QAM),
rd(DIB3000MB_REG_TPS_HRCH), rd(DIB3000MB_REG_TPS_VIT_ALPHA), rd(DIB3000MB_REG_TPS_CODE_RATE_HP), rd(DIB3000MB_REG_TPS_CODE_RATE_LP), rd(DIB3000MB_REG_TPS_GUARD_TIME), rd(DIB3000MB_REG_TPS_FFT), rd(DIB3000MB_REG_TPS_CELL_ID)); //*stat = FE_HAS_SIGNAL | FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC | FE_HAS_LOCK; return 0; } static int dib3000mb_read_ber(struct dvb_frontend* fe, u32 *ber) { struct dib3000_state* state = fe->demodulator_priv; *ber = ((rd(DIB3000MB_REG_BER_MSB) << 16) | rd(DIB3000MB_REG_BER_LSB)); return 0; } /* see dib3000-watch dvb-apps for exact calculations of signal_strength and snr */ static int dib3000mb_read_signal_strength(struct dvb_frontend* fe, u16 *strength) { struct dib3000_state* state = fe->demodulator_priv; *strength = rd(DIB3000MB_REG_SIGNAL_POWER) * 0xffff / 0x170; return 0; } static int dib3000mb_read_snr(struct dvb_frontend* fe, u16 *snr) { struct dib3000_state* state = fe->demodulator_priv; short sigpow = rd(DIB3000MB_REG_SIGNAL_POWER); int icipow = ((rd(DIB3000MB_REG_NOISE_POWER_MSB) & 0xff) << 16) | rd(DIB3000MB_REG_NOISE_POWER_LSB); *snr = (sigpow << 8) / ((icipow > 0) ? icipow : 1); return 0; } static int dib3000mb_read_unc_blocks(struct dvb_frontend* fe, u32 *unc) { struct dib3000_state* state = fe->demodulator_priv; *unc = rd(DIB3000MB_REG_PACKET_ERROR_RATE); return 0; } static int dib3000mb_sleep(struct dvb_frontend* fe) { struct dib3000_state* state = fe->demodulator_priv; deb_info("dib3000mb is going to bed.\n"); wr(DIB3000MB_REG_POWER_CONTROL, DIB3000MB_POWER_DOWN); return 0; } static int dib3000mb_fe_get_tune_settings(struct dvb_frontend* fe, struct dvb_frontend_tune_settings *tune) { tune->min_delay_ms = 800; return 0; } static int dib3000mb_fe_init_nonmobile(struct dvb_frontend* fe) { return dib3000mb_fe_init(fe, 0); } static int dib3000mb_set_frontend_and_tuner(struct dvb_frontend *fe) { return dib3000mb_set_frontend(fe, 1); } static void dib3000mb_release(struct dvb_frontend* fe) { struct dib3000_state *state = fe->demodulator_priv; kfree(state); } /* pid filter and transfer stuff */ static int dib3000mb_pid_control(struct dvb_frontend *fe,int index, int pid,int onoff) { struct dib3000_state *state = fe->demodulator_priv; pid = (onoff ? pid | DIB3000_ACTIVATE_PID_FILTERING : 0); wr(index+DIB3000MB_REG_FIRST_PID,pid); return 0; } static int dib3000mb_fifo_control(struct dvb_frontend *fe, int onoff) { struct dib3000_state *state = fe->demodulator_priv; deb_xfer("%s fifo\n",onoff ? "enabling" : "disabling"); if (onoff) { wr(DIB3000MB_REG_FIFO, DIB3000MB_FIFO_ACTIVATE); } else { wr(DIB3000MB_REG_FIFO, DIB3000MB_FIFO_INHIBIT); } return 0; } static int dib3000mb_pid_parse(struct dvb_frontend *fe, int onoff) { struct dib3000_state *state = fe->demodulator_priv; deb_xfer("%s pid parsing\n",onoff ? 
"enabling" : "disabling"); wr(DIB3000MB_REG_PID_PARSE,onoff); return 0; } static int dib3000mb_tuner_pass_ctrl(struct dvb_frontend *fe, int onoff, u8 pll_addr) { struct dib3000_state *state = fe->demodulator_priv; if (onoff) { wr(DIB3000MB_REG_TUNER, DIB3000_TUNER_WRITE_ENABLE(pll_addr)); } else { wr(DIB3000MB_REG_TUNER, DIB3000_TUNER_WRITE_DISABLE(pll_addr)); } return 0; } static const struct dvb_frontend_ops dib3000mb_ops; struct dvb_frontend* dib3000mb_attach(const struct dib3000_config* config, struct i2c_adapter* i2c, struct dib_fe_xfer_ops *xfer_ops) { struct dib3000_state* state = NULL; /* allocate memory for the internal state */ state = kzalloc(sizeof(struct dib3000_state), GFP_KERNEL); if (state == NULL) goto error; /* setup the state */ state->i2c = i2c; memcpy(&state->config,config,sizeof(struct dib3000_config)); /* check for the correct demod */ if (rd(DIB3000_REG_MANUFACTOR_ID) != DIB3000_I2C_ID_DIBCOM) goto error; if (rd(DIB3000_REG_DEVICE_ID) != DIB3000MB_DEVICE_ID) goto error; /* create dvb_frontend */ memcpy(&state->frontend.ops, &dib3000mb_ops, sizeof(struct dvb_frontend_ops)); state->frontend.demodulator_priv = state; /* set the xfer operations */ xfer_ops->pid_parse = dib3000mb_pid_parse; xfer_ops->fifo_ctrl = dib3000mb_fifo_control; xfer_ops->pid_ctrl = dib3000mb_pid_control; xfer_ops->tuner_pass_ctrl = dib3000mb_tuner_pass_ctrl; return &state->frontend; error: kfree(state); return NULL; } static const struct dvb_frontend_ops dib3000mb_ops = { .delsys = { SYS_DVBT }, .info = { .name = "DiBcom 3000M-B DVB-T", .frequency_min_hz = 44250 * kHz, .frequency_max_hz = 867250 * kHz, .frequency_stepsize_hz = 62500, .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_QAM_64 | FE_CAN_QAM_AUTO | FE_CAN_TRANSMISSION_MODE_AUTO | FE_CAN_GUARD_INTERVAL_AUTO | FE_CAN_RECOVER | FE_CAN_HIERARCHY_AUTO, }, .release = dib3000mb_release, .init = dib3000mb_fe_init_nonmobile, .sleep = dib3000mb_sleep, .set_frontend = dib3000mb_set_frontend_and_tuner, .get_frontend = dib3000mb_get_frontend, .get_tune_settings = dib3000mb_fe_get_tune_settings, .read_status = dib3000mb_read_status, .read_ber = dib3000mb_read_ber, .read_signal_strength = dib3000mb_read_signal_strength, .read_snr = dib3000mb_read_snr, .read_ucblocks = dib3000mb_read_unc_blocks, }; MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); EXPORT_SYMBOL_GPL(dib3000mb_attach); |
// SPDX-License-Identifier: GPL-2.0-or-later /* * Benq DC E300 subdriver * * Copyright (C) 2009 Jean-Francois Moine (http://moinejf.free.fr) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "benq" #include "gspca.h" MODULE_AUTHOR("Jean-Francois Moine <http://moinejf.free.fr>"); MODULE_DESCRIPTION("Benq DC E300 USB Camera Driver"); MODULE_LICENSE("GPL"); /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ }; static const struct v4l2_pix_format vga_mode[] = { {320, 240, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240 * 3 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG}, }; static void sd_isoc_irq(struct urb *urb); /* -- write a register -- */ static void reg_w(struct gspca_dev *gspca_dev, u16 value, u16 index) { struct usb_device *dev = gspca_dev->dev; int ret; if (gspca_dev->usb_err < 0) return; ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x02, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, NULL, 0, 500); if (ret < 0) { pr_err("reg_w err %d\n", ret); gspca_dev->usb_err = ret; } } /* this function is called at probe time */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { gspca_dev->cam.cam_mode = vga_mode; gspca_dev->cam.nmodes = ARRAY_SIZE(vga_mode); gspca_dev->cam.no_urb_create = 1; return 0; } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { return 0; } /* -- start the camera -- */ static int sd_start(struct gspca_dev *gspca_dev) { struct urb *urb; int i, n; /* create 4 URBs - 2 on endpoint 0x83 and 2 on 0x082 */ #if MAX_NURBS < 4 #error "Not enough URBs in the gspca table" #endif #define SD_PKT_SZ 64 #define SD_NPKT 32 for (n = 0; n < 4; n++) { urb = usb_alloc_urb(SD_NPKT, GFP_KERNEL); if (!urb) return -ENOMEM; gspca_dev->urb[n] = urb; urb->transfer_buffer = usb_alloc_coherent(gspca_dev->dev, SD_PKT_SZ * SD_NPKT, GFP_KERNEL, &urb->transfer_dma); if (urb->transfer_buffer == NULL) { pr_err("usb_alloc_coherent failed\n"); return -ENOMEM; } urb->dev = gspca_dev->dev; urb->context = gspca_dev; urb->transfer_buffer_length = SD_PKT_SZ * SD_NPKT; urb->pipe = usb_rcvisocpipe(gspca_dev->dev, n & 1 ?
0x82 : 0x83); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; urb->interval = 1; urb->complete = sd_isoc_irq; urb->number_of_packets = SD_NPKT; for (i = 0; i < SD_NPKT; i++) { urb->iso_frame_desc[i].length = SD_PKT_SZ; urb->iso_frame_desc[i].offset = SD_PKT_SZ * i; } } return gspca_dev->usb_err; } static void sd_stopN(struct gspca_dev *gspca_dev) { struct usb_interface *intf; reg_w(gspca_dev, 0x003c, 0x0003); reg_w(gspca_dev, 0x003c, 0x0004); reg_w(gspca_dev, 0x003c, 0x0005); reg_w(gspca_dev, 0x003c, 0x0006); reg_w(gspca_dev, 0x003c, 0x0007); intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface); usb_set_interface(gspca_dev->dev, gspca_dev->iface, intf->num_altsetting - 1); } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { /* unused */ } /* reception of an URB */ static void sd_isoc_irq(struct urb *urb) { struct gspca_dev *gspca_dev = (struct gspca_dev *) urb->context; struct urb *urb0; u8 *data; int i, st; gspca_dbg(gspca_dev, D_PACK, "sd isoc irq\n"); if (!gspca_dev->streaming) return; if (urb->status != 0) { if (urb->status == -ESHUTDOWN) return; /* disconnection */ #ifdef CONFIG_PM if (gspca_dev->frozen) return; #endif pr_err("urb status: %d\n", urb->status); return; } /* if this is a control URB (ep 0x83), wait */ if (urb == gspca_dev->urb[0] || urb == gspca_dev->urb[2]) return; /* scan both received URBs */ if (urb == gspca_dev->urb[1]) urb0 = gspca_dev->urb[0]; else urb0 = gspca_dev->urb[2]; for (i = 0; i < urb->number_of_packets; i++) { /* check the packet status and length */ if (urb0->iso_frame_desc[i].actual_length != SD_PKT_SZ || urb->iso_frame_desc[i].actual_length != SD_PKT_SZ) { gspca_err(gspca_dev, "ISOC bad lengths %d / %d\n", urb0->iso_frame_desc[i].actual_length, urb->iso_frame_desc[i].actual_length); gspca_dev->last_packet_type = DISCARD_PACKET; continue; } st = urb0->iso_frame_desc[i].status; if (st == 0) st = urb->iso_frame_desc[i].status; if (st) { pr_err("ISOC data error: [%d] status=%d\n", i, st); gspca_dev->last_packet_type = DISCARD_PACKET; continue; } /* * The images are received in URBs of different endpoints * (0x83 and 0x82). * Image pieces in URBs of ep 0x83 are continued in URBs of * ep 0x82 of the same index. * The packets in the URBs of endpoint 0x83 start with: * - 80 ba/bb 00 00 = start of image followed by 'ff d8' * - 04 ba/bb oo oo = image piece * where 'oo oo' is the image offset (not checked) * - (other -> bad frame) * The images are JPEG encoded with full header and * normal ff escape. * The end of image ('ff d9') may occur in any URB.
* (not checked) */ data = (u8 *) urb0->transfer_buffer + urb0->iso_frame_desc[i].offset; if (data[0] == 0x80 && (data[1] & 0xfe) == 0xba) { /* new image */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); gspca_frame_add(gspca_dev, FIRST_PACKET, data + 4, SD_PKT_SZ - 4); } else if (data[0] == 0x04 && (data[1] & 0xfe) == 0xba) { gspca_frame_add(gspca_dev, INTER_PACKET, data + 4, SD_PKT_SZ - 4); } else { gspca_dev->last_packet_type = DISCARD_PACKET; continue; } data = (u8 *) urb->transfer_buffer + urb->iso_frame_desc[i].offset; gspca_frame_add(gspca_dev, INTER_PACKET, data, SD_PKT_SZ); } /* resubmit the URBs */ st = usb_submit_urb(urb0, GFP_ATOMIC); if (st < 0) pr_err("usb_submit_urb(0) ret %d\n", st); st = usb_submit_urb(urb, GFP_ATOMIC); if (st < 0) pr_err("usb_submit_urb() ret %d\n", st); } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x04a5, 0x3035)}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver);
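The packet-header convention that sd_isoc_irq() decodes above can be captured in a tiny standalone helper. The function below is an illustrative sketch distilled from the comment in the driver, not part of the kernel module; the enum and function names are made up for the example.

#include <stdint.h>

/* Packet classes per the benq ep-0x83 header format described above. */
enum benq_pkt_type {
	BENQ_PKT_START,	/* 80 ba/bb .. .. : start of image ('ff d8' follows) */
	BENQ_PKT_PIECE,	/* 04 ba/bb oo oo : image piece at offset 'oo oo' */
	BENQ_PKT_BAD,	/* anything else  : discard the frame */
};

static enum benq_pkt_type benq_classify(const uint8_t hdr[4])
{
	/* The second byte is 0xba or 0xbb, so compare with bit 0 masked off. */
	if (hdr[0] == 0x80 && (hdr[1] & 0xfe) == 0xba)
		return BENQ_PKT_START;
	if (hdr[0] == 0x04 && (hdr[1] & 0xfe) == 0xba)
		return BENQ_PKT_PIECE;
	return BENQ_PKT_BAD;
}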
// SPDX-License-Identifier: GPL-2.0-or-later /* * vimc-sensor.c Virtual Media Controller Driver * * Copyright (C) 2015-2017 Helen Koike <helen.fornazier@gmail.com> */ #include <linux/v4l2-mediabus.h> #include <linux/vmalloc.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/v4l2-subdev.h> #include <media/tpg/v4l2-tpg.h> #include "vimc-common.h" enum vimc_sensor_osd_mode { VIMC_SENSOR_OSD_SHOW_ALL = 0, VIMC_SENSOR_OSD_SHOW_COUNTERS = 1, VIMC_SENSOR_OSD_SHOW_NONE = 2 }; struct vimc_sensor_device { struct vimc_ent_device ved; struct v4l2_subdev sd; struct tpg_data tpg; struct v4l2_ctrl_handler hdl; struct media_pad pad; u8 *frame; /* * Virtual "hardware" configuration, filled when the stream starts or * when controls are set.
*/ struct { struct v4l2_area size; enum vimc_sensor_osd_mode osd_value; u64 start_stream_ts; } hw; }; static const struct v4l2_mbus_framefmt fmt_default = { .width = 640, .height = 480, .code = MEDIA_BUS_FMT_RGB888_1X24, .field = V4L2_FIELD_NONE, .colorspace = V4L2_COLORSPACE_SRGB, }; static int vimc_sensor_init_state(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state) { struct v4l2_mbus_framefmt *mf; mf = v4l2_subdev_state_get_format(sd_state, 0); *mf = fmt_default; return 0; } static int vimc_sensor_enum_mbus_code(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_mbus_code_enum *code) { u32 mbus_code = vimc_mbus_code_by_index(code->index); if (!mbus_code) return -EINVAL; code->code = mbus_code; return 0; } static int vimc_sensor_enum_frame_size(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_frame_size_enum *fse) { const struct vimc_pix_map *vpix; if (fse->index) return -EINVAL; /* Only accept code in the pix map table */ vpix = vimc_pix_map_by_code(fse->code); if (!vpix) return -EINVAL; fse->min_width = VIMC_FRAME_MIN_WIDTH; fse->max_width = VIMC_FRAME_MAX_WIDTH; fse->min_height = VIMC_FRAME_MIN_HEIGHT; fse->max_height = VIMC_FRAME_MAX_HEIGHT; return 0; } static void vimc_sensor_tpg_s_format(struct vimc_sensor_device *vsensor, const struct v4l2_mbus_framefmt *format) { const struct vimc_pix_map *vpix = vimc_pix_map_by_code(format->code); tpg_reset_source(&vsensor->tpg, format->width, format->height, format->field); tpg_s_bytesperline(&vsensor->tpg, 0, format->width * vpix->bpp); tpg_s_buf_height(&vsensor->tpg, format->height); tpg_s_fourcc(&vsensor->tpg, vpix->pixelformat); /* TODO: add support for V4L2_FIELD_ALTERNATE */ tpg_s_field(&vsensor->tpg, format->field, false); tpg_s_colorspace(&vsensor->tpg, format->colorspace); tpg_s_ycbcr_enc(&vsensor->tpg, format->ycbcr_enc); tpg_s_quantization(&vsensor->tpg, format->quantization); tpg_s_xfer_func(&vsensor->tpg, format->xfer_func); } static void vimc_sensor_adjust_fmt(struct v4l2_mbus_framefmt *fmt) { const struct vimc_pix_map *vpix; /* Only accept code in the pix map table */ vpix = vimc_pix_map_by_code(fmt->code); if (!vpix) fmt->code = fmt_default.code; fmt->width = clamp_t(u32, fmt->width, VIMC_FRAME_MIN_WIDTH, VIMC_FRAME_MAX_WIDTH) & ~1; fmt->height = clamp_t(u32, fmt->height, VIMC_FRAME_MIN_HEIGHT, VIMC_FRAME_MAX_HEIGHT) & ~1; /* TODO: add support for V4L2_FIELD_ALTERNATE */ if (fmt->field == V4L2_FIELD_ANY || fmt->field == V4L2_FIELD_ALTERNATE) fmt->field = fmt_default.field; vimc_colorimetry_clamp(fmt); } static int vimc_sensor_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_format *fmt) { struct vimc_sensor_device *vsensor = v4l2_get_subdevdata(sd); struct v4l2_mbus_framefmt *mf; /* Do not change the format while stream is on */ if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE && vsensor->frame) return -EBUSY; mf = v4l2_subdev_state_get_format(sd_state, fmt->pad); /* Set the new format */ vimc_sensor_adjust_fmt(&fmt->format); dev_dbg(vsensor->ved.dev, "%s: format update: " "old:%dx%d (0x%x, %d, %d, %d, %d) " "new:%dx%d (0x%x, %d, %d, %d, %d)\n", vsensor->sd.name, /* old */ mf->width, mf->height, mf->code, mf->colorspace, mf->quantization, mf->xfer_func, mf->ycbcr_enc, /* new */ fmt->format.width, fmt->format.height, fmt->format.code, fmt->format.colorspace, fmt->format.quantization, fmt->format.xfer_func, fmt->format.ycbcr_enc); *mf = fmt->format; return 0; } static const struct v4l2_subdev_pad_ops vimc_sensor_pad_ops = { 
.enum_mbus_code = vimc_sensor_enum_mbus_code, .enum_frame_size = vimc_sensor_enum_frame_size, .get_fmt = v4l2_subdev_get_fmt, .set_fmt = vimc_sensor_set_fmt, }; static void *vimc_sensor_process_frame(struct vimc_ent_device *ved, const void *sink_frame) { struct vimc_sensor_device *vsensor = container_of(ved, struct vimc_sensor_device, ved); const unsigned int line_height = 16; u8 *basep[TPG_MAX_PLANES][2]; unsigned int line = 1; char str[100]; tpg_fill_plane_buffer(&vsensor->tpg, 0, 0, vsensor->frame); tpg_calc_text_basep(&vsensor->tpg, basep, 0, vsensor->frame); switch (vsensor->hw.osd_value) { case VIMC_SENSOR_OSD_SHOW_ALL: { const char *order = tpg_g_color_order(&vsensor->tpg); tpg_gen_text(&vsensor->tpg, basep, line++ * line_height, 16, order); snprintf(str, sizeof(str), "brightness %3d, contrast %3d, saturation %3d, hue %d ", vsensor->tpg.brightness, vsensor->tpg.contrast, vsensor->tpg.saturation, vsensor->tpg.hue); tpg_gen_text(&vsensor->tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), "sensor size: %dx%d", vsensor->hw.size.width, vsensor->hw.size.height); tpg_gen_text(&vsensor->tpg, basep, line++ * line_height, 16, str); fallthrough; } case VIMC_SENSOR_OSD_SHOW_COUNTERS: { unsigned int ms; ms = div_u64(ktime_get_ns() - vsensor->hw.start_stream_ts, 1000000); snprintf(str, sizeof(str), "%02d:%02d:%02d:%03d", (ms / (60 * 60 * 1000)) % 24, (ms / (60 * 1000)) % 60, (ms / 1000) % 60, ms % 1000); tpg_gen_text(&vsensor->tpg, basep, line++ * line_height, 16, str); break; } case VIMC_SENSOR_OSD_SHOW_NONE: default: break; } return vsensor->frame; } static int vimc_sensor_s_stream(struct v4l2_subdev *sd, int enable) { struct vimc_sensor_device *vsensor = container_of(sd, struct vimc_sensor_device, sd); if (enable) { const struct v4l2_mbus_framefmt *format; struct v4l2_subdev_state *state; const struct vimc_pix_map *vpix; unsigned int frame_size; state = v4l2_subdev_lock_and_get_active_state(sd); format = v4l2_subdev_state_get_format(state, 0); /* Configure the test pattern generator. */ vimc_sensor_tpg_s_format(vsensor, format); /* Calculate the frame size. */ vpix = vimc_pix_map_by_code(format->code); frame_size = format->width * vpix->bpp * format->height; vsensor->hw.size.width = format->width; vsensor->hw.size.height = format->height; v4l2_subdev_unlock_state(state); /* * Allocate the frame buffer. 
Use vmalloc to be able to * allocate a large amount of memory */ vsensor->frame = vmalloc(frame_size); if (!vsensor->frame) return -ENOMEM; vsensor->hw.start_stream_ts = ktime_get_ns(); } else { vfree(vsensor->frame); vsensor->frame = NULL; } return 0; } static const struct v4l2_subdev_core_ops vimc_sensor_core_ops = { .log_status = v4l2_ctrl_subdev_log_status, .subscribe_event = v4l2_ctrl_subdev_subscribe_event, .unsubscribe_event = v4l2_event_subdev_unsubscribe, }; static const struct v4l2_subdev_video_ops vimc_sensor_video_ops = { .s_stream = vimc_sensor_s_stream, }; static const struct v4l2_subdev_ops vimc_sensor_ops = { .core = &vimc_sensor_core_ops, .pad = &vimc_sensor_pad_ops, .video = &vimc_sensor_video_ops, }; static const struct v4l2_subdev_internal_ops vimc_sensor_internal_ops = { .init_state = vimc_sensor_init_state, }; static int vimc_sensor_s_ctrl(struct v4l2_ctrl *ctrl) { struct vimc_sensor_device *vsensor = container_of(ctrl->handler, struct vimc_sensor_device, hdl); switch (ctrl->id) { case VIMC_CID_TEST_PATTERN: tpg_s_pattern(&vsensor->tpg, ctrl->val); break; case V4L2_CID_HFLIP: tpg_s_hflip(&vsensor->tpg, ctrl->val); break; case V4L2_CID_VFLIP: tpg_s_vflip(&vsensor->tpg, ctrl->val); break; case V4L2_CID_BRIGHTNESS: tpg_s_brightness(&vsensor->tpg, ctrl->val); break; case V4L2_CID_CONTRAST: tpg_s_contrast(&vsensor->tpg, ctrl->val); break; case V4L2_CID_HUE: tpg_s_hue(&vsensor->tpg, ctrl->val); break; case V4L2_CID_SATURATION: tpg_s_saturation(&vsensor->tpg, ctrl->val); break; case VIMC_CID_OSD_TEXT_MODE: vsensor->hw.osd_value = ctrl->val; break; default: return -EINVAL; } return 0; } static const struct v4l2_ctrl_ops vimc_sensor_ctrl_ops = { .s_ctrl = vimc_sensor_s_ctrl, }; static void vimc_sensor_release(struct vimc_ent_device *ved) { struct vimc_sensor_device *vsensor = container_of(ved, struct vimc_sensor_device, ved); v4l2_ctrl_handler_free(&vsensor->hdl); tpg_free(&vsensor->tpg); v4l2_subdev_cleanup(&vsensor->sd); media_entity_cleanup(vsensor->ved.ent); kfree(vsensor); } /* Image Processing Controls */ static const struct v4l2_ctrl_config vimc_sensor_ctrl_class = { .flags = V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY, .id = VIMC_CID_VIMC_CLASS, .name = "VIMC Controls", .type = V4L2_CTRL_TYPE_CTRL_CLASS, }; static const struct v4l2_ctrl_config vimc_sensor_ctrl_test_pattern = { .ops = &vimc_sensor_ctrl_ops, .id = VIMC_CID_TEST_PATTERN, .name = "Test Pattern", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_PAT_NOISE, .qmenu = tpg_pattern_strings, }; static const char * const vimc_ctrl_osd_mode_strings[] = { "All", "Counters Only", "None", NULL, }; static const struct v4l2_ctrl_config vimc_sensor_ctrl_osd_mode = { .ops = &vimc_sensor_ctrl_ops, .id = VIMC_CID_OSD_TEXT_MODE, .name = "Show Information", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vimc_ctrl_osd_mode_strings) - 2, .qmenu = vimc_ctrl_osd_mode_strings, }; static struct vimc_ent_device *vimc_sensor_add(struct vimc_device *vimc, const char *vcfg_name) { struct v4l2_device *v4l2_dev = &vimc->v4l2_dev; struct vimc_sensor_device *vsensor; int ret; /* Allocate the vsensor struct */ vsensor = kzalloc(sizeof(*vsensor), GFP_KERNEL); if (!vsensor) return ERR_PTR(-ENOMEM); v4l2_ctrl_handler_init(&vsensor->hdl, 4); v4l2_ctrl_new_custom(&vsensor->hdl, &vimc_sensor_ctrl_class, NULL); v4l2_ctrl_new_custom(&vsensor->hdl, &vimc_sensor_ctrl_test_pattern, NULL); v4l2_ctrl_new_custom(&vsensor->hdl, &vimc_sensor_ctrl_osd_mode, NULL); v4l2_ctrl_new_std(&vsensor->hdl, &vimc_sensor_ctrl_ops, V4L2_CID_VFLIP, 0, 1, 1, 0); 
v4l2_ctrl_new_std(&vsensor->hdl, &vimc_sensor_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); v4l2_ctrl_new_std(&vsensor->hdl, &vimc_sensor_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 255, 1, 128); v4l2_ctrl_new_std(&vsensor->hdl, &vimc_sensor_ctrl_ops, V4L2_CID_CONTRAST, 0, 255, 1, 128); v4l2_ctrl_new_std(&vsensor->hdl, &vimc_sensor_ctrl_ops, V4L2_CID_HUE, -128, 127, 1, 0); v4l2_ctrl_new_std(&vsensor->hdl, &vimc_sensor_ctrl_ops, V4L2_CID_SATURATION, 0, 255, 1, 128); vsensor->sd.ctrl_handler = &vsensor->hdl; if (vsensor->hdl.error) { ret = vsensor->hdl.error; goto err_free_vsensor; } /* Initialize the test pattern generator */ tpg_init(&vsensor->tpg, fmt_default.width, fmt_default.height); ret = tpg_alloc(&vsensor->tpg, VIMC_FRAME_MAX_WIDTH); if (ret) goto err_free_hdl; /* Initialize ved and sd */ vsensor->pad.flags = MEDIA_PAD_FL_SOURCE; ret = vimc_ent_sd_register(&vsensor->ved, &vsensor->sd, v4l2_dev, vcfg_name, MEDIA_ENT_F_CAM_SENSOR, 1, &vsensor->pad, &vimc_sensor_internal_ops, &vimc_sensor_ops); if (ret) goto err_free_tpg; vsensor->ved.process_frame = vimc_sensor_process_frame; vsensor->ved.dev = vimc->mdev.dev; return &vsensor->ved; err_free_tpg: tpg_free(&vsensor->tpg); err_free_hdl: v4l2_ctrl_handler_free(&vsensor->hdl); err_free_vsensor: kfree(vsensor); return ERR_PTR(ret); } const struct vimc_ent_type vimc_sensor_type = { .add = vimc_sensor_add, .release = vimc_sensor_release }; |
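/*
 * Editorial example (not part of the driver above): a minimal user-space
 * sketch of driving the sensor's V4L2 controls, assuming the sensor
 * subdevice is exposed as /dev/v4l-subdev0 (the actual node depends on how
 * the vimc media graph was probed on the system).
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/videodev2.h>

int main(void)
{
	struct v4l2_control ctrl = { .id = V4L2_CID_BRIGHTNESS, .value = 200 };
	int fd = open("/dev/v4l-subdev0", O_RDWR);

	if (fd < 0)
		return 1;
	/* Dispatched to vimc_sensor_s_ctrl(), which calls tpg_s_brightness(). */
	if (ioctl(fd, VIDIOC_S_CTRL, &ctrl) < 0)
		perror("VIDIOC_S_CTRL");
	close(fd);
	return 0;
}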
// SPDX-License-Identifier: GPL-2.0-only /* * inode.c - securityfs * * Copyright (C) 2005 Greg Kroah-Hartman <gregkh@suse.de> * * Based on fs/debugfs/inode.c which had the following copyright notice: * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. */ /* #define DEBUG */ #include <linux/sysfs.h> #include <linux/kobject.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/lsm_hooks.h> #include <linux/magic.h> static struct vfsmount *mount; static int mount_count; static void securityfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations securityfs_super_operations = { .statfs = simple_statfs, .free_inode = securityfs_free_inode, }; static int securityfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr files[] = {{""}}; int error; error = simple_fill_super(sb, SECURITYFS_MAGIC, files); if (error) return error; sb->s_op = &securityfs_super_operations; return 0; } static int securityfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, securityfs_fill_super); } static const struct fs_context_operations securityfs_context_ops = { .get_tree = securityfs_get_tree, }; static int securityfs_init_fs_context(struct fs_context *fc) { fc->ops = &securityfs_context_ops; return 0; } static struct file_system_type fs_type = { .owner = THIS_MODULE, .name = "securityfs", .init_fs_context = securityfs_init_fs_context, .kill_sb = kill_litter_super, }; /** * securityfs_create_dentry - create a dentry in the securityfs filesystem * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on.
The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @iops: a point to a struct of inode_operations that should be used for * this file/dir * * This is the basic "create a file/dir/symlink" function for * securityfs. It allows for a wide range of flexibility in creating * a file, or a directory (if you want to create a directory, the * securityfs_create_dir() function is recommended to be used * instead). * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the * file is to be removed (no automatic cleanup happens if your module * is unloaded, you are responsible here). If an error occurs, the * function will return the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, const struct inode_operations *iops) { struct dentry *dentry; struct inode *dir, *inode; int error; bool pinned = false; if (!(mode & S_IFMT)) mode = (mode & S_IALLUGO) | S_IFREG; pr_debug("securityfs: creating file '%s'\n",name); if (!parent) { error = simple_pin_fs(&fs_type, &mount, &mount_count); if (error) return ERR_PTR(error); pinned = true; parent = mount->mnt_root; } dir = d_inode(parent); inode_lock(dir); dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) goto out; if (d_really_is_positive(dentry)) { error = -EEXIST; goto out1; } inode = new_inode(dir->i_sb); if (!inode) { error = -ENOMEM; goto out1; } inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); inode->i_private = data; if (S_ISDIR(mode)) { inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inc_nlink(inode); inc_nlink(dir); } else if (S_ISLNK(mode)) { inode->i_op = iops ? iops : &simple_symlink_inode_operations; inode->i_link = data; } else { inode->i_fop = fops; } d_instantiate(dentry, inode); inode_unlock(dir); return dentry; out1: dput(dentry); dentry = ERR_PTR(error); out: inode_unlock(dir); if (pinned) simple_release_fs(&mount, &mount_count); return dentry; } /** * securityfs_create_file - create a file in the securityfs filesystem * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This function creates a file in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. 
*/ struct dentry *securityfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return securityfs_create_dentry(name, mode, parent, data, fops, NULL); } EXPORT_SYMBOL_GPL(securityfs_create_file); /** * securityfs_create_dir - create a directory in the securityfs filesystem * * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * * This function creates a directory in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_dir(const char *name, struct dentry *parent) { return securityfs_create_file(name, S_IFDIR | 0755, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(securityfs_create_dir); /** * securityfs_create_symlink - create a symlink in the securityfs filesystem * * @name: a pointer to a string containing the name of the symlink to * create. * @parent: a pointer to the parent dentry for the symlink. This should be a * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * @target: a pointer to a string containing the name of the symlink's target. * If this parameter is %NULL, then the @iops parameter needs to be * setup to handle .readlink and .get_link inode_operations. * @iops: a pointer to the struct inode_operations to use for the symlink. If * this parameter is %NULL, then the default simple_symlink_inode * operations will be used. * * This function creates a symlink in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_symlink(const char *name, struct dentry *parent, const char *target, const struct inode_operations *iops) { struct dentry *dent; char *link = NULL; if (target) { link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); } dent = securityfs_create_dentry(name, S_IFLNK | 0444, parent, link, NULL, iops); if (IS_ERR(dent)) kfree(link); return dent; } EXPORT_SYMBOL_GPL(securityfs_create_symlink); static void remove_one(struct dentry *victim) { if (victim->d_parent == victim->d_sb->s_root) simple_release_fs(&mount, &mount_count); } /** * securityfs_remove - removes a file or directory from the securityfs filesystem * * @dentry: a pointer to a the dentry of the file or directory to be removed. * * This function removes a file or directory in securityfs that was previously * created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) 
* * This function is required to be called in order for the file to be * removed. No automatic cleanup of files will happen when a module is * removed; you are responsible here. * * AV: when applied to directory it will take all children out; no need to call * it for descendents if ancestor is getting killed. */ void securityfs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&fs_type, &mount, &mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); #ifdef CONFIG_SECURITY static struct dentry *lsm_dentry; static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { return simple_read_from_buffer(buf, count, ppos, lsm_names, strlen(lsm_names)); } static const struct file_operations lsm_ops = { .read = lsm_read, .llseek = generic_file_llseek, }; #endif static int __init securityfs_init(void) { int retval; retval = sysfs_create_mount_point(kernel_kobj, "security"); if (retval) return retval; retval = register_filesystem(&fs_type); if (retval) { sysfs_remove_mount_point(kernel_kobj, "security"); return retval; } #ifdef CONFIG_SECURITY lsm_dentry = securityfs_create_file("lsm", 0444, NULL, NULL, &lsm_ops); #endif return 0; } core_initcall(securityfs_init); |
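/*
 * Editorial example (not part of securityfs itself): a minimal sketch of how
 * a hypothetical security module might use the API documented above.  The
 * "example" directory, the "status" file and their file_operations are
 * invented names for illustration only.
 */
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/security.h>

static ssize_t example_status_read(struct file *file, char __user *buf,
				   size_t count, loff_t *ppos)
{
	static const char status[] = "enabled\n";

	return simple_read_from_buffer(buf, count, ppos, status,
				       sizeof(status) - 1);
}

static const struct file_operations example_status_ops = {
	.read = example_status_read,
	.llseek = generic_file_llseek,
};

static struct dentry *example_dir;

static int __init example_securityfs_init(void)
{
	struct dentry *file;

	example_dir = securityfs_create_dir("example", NULL);
	if (IS_ERR(example_dir))
		return PTR_ERR(example_dir);

	file = securityfs_create_file("status", 0444, example_dir, NULL,
				      &example_status_ops);
	if (IS_ERR(file)) {
		/* Removing the directory also removes any children. */
		securityfs_remove(example_dir);
		return PTR_ERR(file);
	}
	return 0;
}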
| 15 28 44 44 43 44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor ipc mediation * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #include <linux/gfp.h> #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/policy.h" #include "include/ipc.h" #include "include/sig_names.h" static inline int map_signal_num(int sig) { if (sig > SIGRTMAX) return SIGUNKNOWN; else if (sig >= SIGRTMIN) return sig - SIGRTMIN + SIGRT_BASE; else if (sig < MAXMAPPED_SIG) return sig_map[sig]; return SIGUNKNOWN; } /** * audit_signal_mask - convert mask to permission string * @mask: permission mask to convert * * Returns: pointer to static string */ static const char *audit_signal_mask(u32 mask) { if (mask & MAY_READ) return "receive"; if (mask & MAY_WRITE) return "send"; return ""; } /** * audit_signal_cb() - call back for signal specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ static void audit_signal_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); if (ad->request & AA_SIGNAL_PERM_MASK) { audit_log_format(ab, " requested_mask=\"%s\"", audit_signal_mask(ad->request)); if (ad->denied & AA_SIGNAL_PERM_MASK) { audit_log_format(ab, " denied_mask=\"%s\"", audit_signal_mask(ad->denied)); } } if (ad->signal == SIGUNKNOWN) audit_log_format(ab, "signal=unknown(%d)", ad->unmappedsig); else if (ad->signal < MAXMAPPED_SIGNAME) audit_log_format(ab, " signal=%s", sig_names[ad->signal]); else audit_log_format(ab, " signal=rtmin+%d", ad->signal - SIGRT_BASE); audit_log_format(ab, " peer="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAGS_NONE, GFP_ATOMIC); } static int profile_signal_perm(const struct cred *cred, struct aa_profile *profile, struct aa_label *peer, u32 request, struct apparmor_audit_data *ad) { struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms perms; aa_state_t state; if (profile_unconfined(profile)) return 0; ad->subj_cred = cred; ad->peer = peer; /* TODO: secondary cache check <profile, profile, perm> */ state = RULE_MEDIATES(rules, AA_CLASS_SIGNAL); if (!state) return 0; state = aa_dfa_next(rules->policy->dfa, state, ad->signal); aa_label_match(profile, rules, peer, state, false, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_signal_cb); } int aa_may_signal(const struct cred *subj_cred, struct aa_label *sender, const struct cred *target_cred, struct aa_label *target, int sig) { struct aa_profile *profile; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_SIGNAL, OP_SIGNAL); ad.signal = map_signal_num(sig); ad.unmappedsig = sig; return xcheck_labels(sender, target, profile, profile_signal_perm(subj_cred, profile, target, MAY_WRITE, &ad), profile_signal_perm(target_cred, profile, sender, MAY_READ, &ad)); } |
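/*
 * Editorial example (not part of AppArmor): what the mediation above looks
 * like from user space.  If the sending task's confinement has no matching
 * signal-send rule for the peer, the kill() below fails and an audit record
 * formatted by audit_signal_cb() is emitted; the exact errno (commonly
 * EACCES or EPERM) depends on the policy engine.  The pid is a placeholder.
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>

static void example_try_signal(pid_t pid)
{
	if (kill(pid, SIGTERM) < 0 && (errno == EACCES || errno == EPERM))
		fprintf(stderr, "SIGTERM to %d denied by policy\n", (int)pid);
}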
// SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/csd.h> #undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other CPUs * might arrive late, either due to hardware latencies or because this * CPU disabled interrupts (inside
stop-machine) before the IPIs were * sent. So flush out any pending callbacks explicitly (without waiting * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go * offline with work still pending. * * This runs with interrupts disabled inside the stopper task invoked by * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush. */ __flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } static __always_inline void send_call_function_single_ipi(int cpu) { if (call_function_single_prep_ipi(cpu)) { trace_ipi_send_cpu(cpu, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_single_ipi(cpu); } } static __always_inline void send_call_function_ipi_mask(struct cpumask *mask) { trace_ipi_send_cpumask(mask, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_ipi_mask(mask); } static __always_inline void csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); trace_csd_function_exit(func, csd); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); /* * Parse the csdlock_debug= kernel boot parameter. * * If you need to restore the old "ext" value that once provided * additional debugging information, reapply the following commits: * * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") */ static int __init csdlock_debug(char *str) { int ret; unsigned int val = 0; ret = get_option(&str, &val); if (ret) { if (val) static_branch_enable(&csdlock_debug_enabled); else static_branch_disable(&csdlock_debug_enabled); } return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0644); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ module_param(panic_on_ipistall, int, 0644); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. */ } static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? * * Returns @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. 
*/ bool csd_lock_is_stuck(void) { return !!atomic_read(&n_csd_lock_stuck); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); atomic_dec(&n_csd_lock_stuck); return true; } ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || csd_lock_timeout_ns == 0)) return false; if (ts0 > ts2) { /* Our own sched_clock went backward; don't blame another CPU. */ ts_delta = ts0 - ts2; pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); *ts1 = ts2; return false; } firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering * on underflows when the TSC is out of sync between sockets. */ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } if (firsttime) dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
*/ static void __csd_lock_wait(call_single_data_t *csd) { unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else static void csd_lock_record(call_single_data_t *csd) { } static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { /* * We have to check the type of the CSD before queueing it, because * once queued it can have its flags cleared by * flush_smp_call_function_queue() * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; csd = container_of(node, call_single_data_t, node.llist); func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* * The list addition should be visible to the target CPU when it pops * the head of the list to pull the entry off it in the IPI handler * because of normal cache coherency rules implied by the underlying * llist ops. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, call_single_data_t *csd) { /* * Preemption already disabled here so stopper cannot run on this CPU, * ensuring mutually exclusive CPU offlining and last IPI flush. */ if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. 
* Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { __flush_smp_call_function_queue(true); } /** * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; atomic_t *tbt; lockdep_assert_irqs_disabled(); /* Allow waiters to send backtrace NMI from here onwards */ tbt = this_cpu_ptr(&trigger_backtrace); atomic_set_release(tbt, 1); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) return; /* * Second; run all !SYNC callbacks. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. */ if (entry) { csd = llist_entry(entry, typeof(*csd), node.llist); csd_do_func(sched_ttwu_pending, entry, csd); } } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * from task context (idle, migration thread) * * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to * handle queued SMP function calls before scheduling. 
* * The migration thread has to ensure that an eventually pending wakeup has * been handled before it migrates a task. */ void flush_smp_call_function_queue(void) { unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * Prevent preemption and reschedule on another CPU, as well as CPU * removal. This prevents stopper from running on this CPU, thus * providing mutual exclusion of the below cpu_online() check and * IPI sending ensuring IPI are not missed by CPU going offline. */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. 
* * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). * * Selection preference: * 1) current cpu if in @mask * 2) nearest cpu in @mask, based on NUMA topology */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (!cpumask_test_cpu(cpu, mask)) cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu)); ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need remote execution, i.e., any CPU excluding this one. */ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); if (cond_func && !cond_func(cpu, info)) { __cpumask_clear_cpu(cpu, cfd->cpumask); continue; } /* Work is enqueued on a remote CPU. */ run_remote = true; csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); /* * Kick the remote CPU if this is the first work * item enqueued. */ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } } /* * Choose the most efficient way to send an IPI. 
Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) send_call_function_ipi_mask(cfd->cpumask_ipi); } /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); csd_do_func(func, info, NULL); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } } /** * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. 
*/ void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) set_nr_cpu_ids(nr_cpus); return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); #if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a boolean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer. It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. 
*/ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * wake_up_all_idle_cpus try to break all cpus which is in idle state even * including idle polling cpus, for non-idle cpus, we will do nothing * for them. */ void wake_up_all_idle_cpus(void) { int cpu; for_each_possible_cpu(cpu) { preempt_disable(); if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); preempt_enable(); } } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu); |
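A minimal usage sketch for the cross-call API documented above; the names bump_counter, cpu_is_even and run_on_even_cpus are illustrative, not part of the kernel source. It shows the cond_func/func split of on_each_cpu_cond_mask(): the condition callback picks which CPUs receive the IPI, the work callback then runs on each selected CPU with interrupts disabled, and wait=true blocks until every selected CPU has finished.

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>

static void bump_counter(void *info)
{
        /* Runs on each selected CPU, in IPI context with interrupts disabled. */
        atomic_t *counter = info;

        atomic_inc(counter);
}

static bool cpu_is_even(int cpu, void *info)
{
        /* Only even-numbered CPUs get an IPI. */
        return (cpu & 1) == 0;
}

static void run_on_even_cpus(atomic_t *counter)
{
        /* Process context only; wait=true returns after bump_counter() ran everywhere selected. */
        on_each_cpu_cond_mask(cpu_is_even, bump_counter, counter, true,
                              cpu_online_mask);
}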
| 151 39 117 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | // SPDX-License-Identifier: GPL-2.0-only /* * Lock-less NULL terminated single linked list * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/llist.h> /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list * * If list is empty, return NULL, otherwise, return the first entry * deleted, this is the newest added one. * * Only one llist_del_first user can be used simultaneously with * multiple llist_add users without lock. Because otherwise * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, * llist_add) sequence in another user may change @head->first->next, * but keep @head->first. If multiple consumers are needed, please * use llist_del_all or use lock between consumers. */ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); do { if (entry == NULL) return NULL; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } EXPORT_SYMBOL_GPL(llist_del_first); /** * llist_del_first_this - delete given entry of lock-less list if it is first * @head: the head for your lock-less list * @this: a list entry. * * If head of the list is given entry, delete and return %true else * return %false. * * Multiple callers can safely call this concurrently with multiple * llist_add() callers, providing all the callers offer a different @this. */ bool llist_del_first_this(struct llist_head *head, struct llist_node *this) { struct llist_node *entry, *next; /* acquire ensures orderig wrt try_cmpxchg() is llist_del_first() */ entry = smp_load_acquire(&head->first); do { if (entry != this) return false; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return true; } EXPORT_SYMBOL_GPL(llist_del_first_this); /** * llist_reverse_order - reverse order of a llist chain * @head: first item of the list to be reversed * * Reverse the order of a chain of llist entries and return the * new first entry. */ struct llist_node *llist_reverse_order(struct llist_node *head) { struct llist_node *new_head = NULL; while (head) { struct llist_node *tmp = head; head = head->next; tmp->next = new_head; new_head = tmp; } return new_head; } EXPORT_SYMBOL_GPL(llist_reverse_order); |
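A hedged usage sketch of the multi-producer/single-consumer pattern the comments above describe; struct work_item and the producer/consumer helpers are illustrative, not kernel symbols. Producers may call llist_add() concurrently (even from IRQ context); the single consumer drains everything with llist_del_all() and uses llist_reverse_order() to process entries oldest-first.

#include <linux/llist.h>
#include <linux/slab.h>

struct work_item {
        struct llist_node node;
        int payload;
};

static LLIST_HEAD(pending);

/* May run concurrently on any CPU, including from interrupt context. */
static void producer(int value)
{
        struct work_item *item = kmalloc(sizeof(*item), GFP_ATOMIC);

        if (!item)
                return;
        item->payload = value;
        llist_add(&item->node, &pending);
}

/* Only one consumer may run at a time, per the llist_del_first() rules above. */
static void consumer(void)
{
        struct llist_node *first = llist_del_all(&pending);
        struct work_item *item, *tmp;

        first = llist_reverse_order(first);     /* oldest entry first */
        llist_for_each_entry_safe(item, tmp, first, node) {
                /* process item->payload here */
                kfree(item);
        }
}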
| 526 525 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if_arp.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/llc.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> #include <linux/export.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ static void netdev_port_receive(struct sk_buff *skb) { struct vport *vport; vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; if (skb->dev->type == ARPHRD_ETHER) skb_push_rcsum(skb, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: kfree_skb(skb); } /* Called with rcu_read_lock and bottom-halves disabled. */ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } static struct net_device *get_dpdev(const struct datapath *dp) { struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); return local->dev; } struct vport *ovs_netdev_link(struct vport *vport, const char *name) { int err; vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); if (!vport->dev) { err = -ENODEV; goto error_free_vport; } /* Ensure that the device exists and that the provided * name is not one of its aliases. 
*/ if (strcmp(name, ovs_vport_name(vport))) { err = -ENODEV; goto error_put; } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) goto error_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; dev_disable_lro(vport->dev); dev_set_promiscuity(vport->dev, 1); vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); } static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); netdev_put(vport->dev, &vport->dev_tracker); ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and * underlying netdev deregistration; delete the link only * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) { return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); } |
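A hedged sketch of how another vport implementation might reuse the helpers exported by this file, loosely following the in-tree tunnel vports; all example_tnl_* names are illustrative. The point is the pairing: ovs_netdev_link() in the ->create path, ovs_netdev_tunnel_destroy() as ->destroy so that both explicit deletion and underlying netdev unregistration are handled, and dev_queue_xmit() as ->send.

#include <linux/err.h>
#include <linux/netdevice.h>
#include "vport.h"
#include "vport-netdev.h"

static struct vport_ops example_tnl_vport_ops;

static struct vport *example_tnl_create(const struct vport_parms *parms)
{
        struct vport *vport;

        vport = ovs_vport_alloc(0, &example_tnl_vport_ops, parms);
        if (IS_ERR(vport))
                return vport;

        /* A real tunnel port would create its net_device before linking. */
        return ovs_netdev_link(vport, parms->name);
}

static struct vport_ops example_tnl_vport_ops = {
        .type    = OVS_VPORT_TYPE_GENEVE,       /* any tunnel vport type */
        .create  = example_tnl_create,
        .destroy = ovs_netdev_tunnel_destroy,   /* copes with netdev unregistration */
        .send    = dev_queue_xmit,
};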
/*
SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FB_H #define _LINUX_FB_H #include <uapi/linux/fb.h> #define FBIO_CURSOR _IOWR('F', 0x08, struct fb_cursor_user) #include <linux/mutex.h> #include <linux/printk.h> #include <linux/refcount.h> #include <linux/types.h> #include <linux/workqueue.h> #include <asm/video.h> struct backlight_device; struct device; struct device_node; struct fb_info; struct file; struct i2c_adapter; struct inode; struct lcd_device; struct module; struct notifier_block; struct page; struct videomode; struct vm_area_struct; /* Definitions below are used in the parsed monitor specs */ #define FB_DPMS_ACTIVE_OFF 1 #define FB_DPMS_SUSPEND 2 #define FB_DPMS_STANDBY 4 #define FB_DISP_DDI 1 #define FB_DISP_ANA_700_300 2 #define FB_DISP_ANA_714_286 4 #define FB_DISP_ANA_1000_400 8 #define FB_DISP_ANA_700_000 16 #define FB_DISP_MONO 32 #define FB_DISP_RGB 64 #define FB_DISP_MULTI 128 #define FB_DISP_UNKNOWN 256 #define FB_SIGNAL_NONE 0 #define FB_SIGNAL_BLANK_BLANK 1 #define FB_SIGNAL_SEPARATE 2 #define FB_SIGNAL_COMPOSITE 4 #define FB_SIGNAL_SYNC_ON_GREEN 8 #define FB_SIGNAL_SERRATION_ON 16 #define FB_MISC_PRIM_COLOR 1 #define FB_MISC_1ST_DETAIL 2 /* First Detailed Timing is preferred */ #define FB_MISC_HDMI 4 struct fb_chroma { __u32 redx; /* in fraction of 1024 */ __u32 greenx; __u32 bluex; __u32 whitex; __u32 redy; __u32 greeny; __u32 bluey; __u32 whitey; }; struct fb_monspecs { struct fb_chroma chroma; struct fb_videomode *modedb; /* mode database */ __u8 manufacturer[4]; /* Manufacturer */ __u8 monitor[14]; /* Monitor String */ __u8 serial_no[14]; /* Serial Number */ __u8 ascii[14]; /* ? */ __u32 modedb_len; /* mode database length */ __u32 model; /* Monitor Model */ __u32 serial; /* Serial Number - Integer */ __u32 year; /* Year manufactured */ __u32 week; /* Week Manufactured */ __u32 hfmin; /* hfreq lower limit (Hz) */ __u32 hfmax; /* hfreq upper limit (Hz) */ __u32 dclkmin; /* pixelclock lower limit (Hz) */ __u32 dclkmax; /* pixelclock upper limit (Hz) */ __u16 input; /* display type - see FB_DISP_* */ __u16 dpms; /* DPMS support - see FB_DPMS_ */ __u16 signal; /* Signal Type - see FB_SIGNAL_* */ __u16 vfmin; /* vfreq lower limit (Hz) */ __u16 vfmax; /* vfreq upper limit (Hz) */ __u16 gamma; /* Gamma - in fractions of 100 */ __u16 gtf : 1; /* supports GTF */ __u16 misc; /* Misc flags - see FB_MISC_* */ __u8 version; /* EDID version... 
*/ __u8 revision; /* ...and revision */ __u8 max_x; /* Maximum horizontal size (cm) */ __u8 max_y; /* Maximum vertical size (cm) */ }; struct fb_cmap_user { __u32 start; /* First entry */ __u32 len; /* Number of entries */ __u16 __user *red; /* Red values */ __u16 __user *green; __u16 __user *blue; __u16 __user *transp; /* transparency, can be NULL */ }; struct fb_image_user { __u32 dx; /* Where to place image */ __u32 dy; __u32 width; /* Size of image */ __u32 height; __u32 fg_color; /* Only used when a mono bitmap */ __u32 bg_color; __u8 depth; /* Depth of the image */ const char __user *data; /* Pointer to image data */ struct fb_cmap_user cmap; /* color map info */ }; struct fb_cursor_user { __u16 set; /* what to set */ __u16 enable; /* cursor on/off */ __u16 rop; /* bitop operation */ const char __user *mask; /* cursor mask bits */ struct fbcurpos hot; /* cursor hot spot */ struct fb_image_user image; /* Cursor image */ }; /* * Register/unregister for framebuffer events */ #ifdef CONFIG_GUMSTIX_AM200EPD /* only used by mach-pxa/am200epd.c */ #define FB_EVENT_FB_REGISTERED 0x05 #define FB_EVENT_FB_UNREGISTERED 0x06 #endif struct fb_event { struct fb_info *info; void *data; }; /* Enough for the VT console needs, see its max_font_width/height */ #define FB_MAX_BLIT_WIDTH 64 #define FB_MAX_BLIT_HEIGHT 128 struct fb_blit_caps { DECLARE_BITMAP(x, FB_MAX_BLIT_WIDTH); DECLARE_BITMAP(y, FB_MAX_BLIT_HEIGHT); u32 len; u32 flags; }; #ifdef CONFIG_FB_NOTIFY extern int fb_register_client(struct notifier_block *nb); extern int fb_unregister_client(struct notifier_block *nb); extern int fb_notifier_call_chain(unsigned long val, void *v); #else static inline int fb_register_client(struct notifier_block *nb) { return 0; }; static inline int fb_unregister_client(struct notifier_block *nb) { return 0; }; static inline int fb_notifier_call_chain(unsigned long val, void *v) { return 0; }; #endif /* * Pixmap structure definition * * The purpose of this structure is to translate data * from the hardware independent format of fbdev to what * format the hardware needs. 
*/ #define FB_PIXMAP_DEFAULT 1 /* used internally by fbcon */ #define FB_PIXMAP_SYSTEM 2 /* memory is in system RAM */ #define FB_PIXMAP_IO 4 /* memory is iomapped */ #define FB_PIXMAP_SYNC 256 /* set if GPU can DMA */ struct fb_pixmap { u8 *addr; /* pointer to memory */ u32 size; /* size of buffer in bytes */ u32 offset; /* current offset to buffer */ u32 buf_align; /* byte alignment of each bitmap */ u32 scan_align; /* alignment per scanline */ u32 access_align; /* alignment per read/write (bits) */ u32 flags; /* see FB_PIXMAP_* */ /* supported bit block dimensions */ /* Format: test_bit(width - 1, blit_x) */ /* test_bit(height - 1, blit_y) */ /* if zero, will be set to full (all) */ DECLARE_BITMAP(blit_x, FB_MAX_BLIT_WIDTH); DECLARE_BITMAP(blit_y, FB_MAX_BLIT_HEIGHT); /* access methods */ void (*writeio)(struct fb_info *info, void __iomem *dst, void *src, unsigned int size); void (*readio) (struct fb_info *info, void *dst, void __iomem *src, unsigned int size); }; #ifdef CONFIG_FB_DEFERRED_IO struct fb_deferred_io_pageref { struct page *page; unsigned long offset; /* private */ struct list_head list; }; struct fb_deferred_io { /* delay between mkwrite and deferred handler */ unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ int open_count; /* number of opened files; protected by fb_info lock */ struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); }; #endif /* * Frame buffer operations * * LOCKING NOTE: those functions must _ALL_ be called with the console * semaphore held, this is the only suitable locking mechanism we have * in 2.6. Some may be called at interrupt time at this point though. * * The exception to this is the debug related hooks. Putting the fb * into a debug state (e.g. flipping to the kernel console) and restoring * it must be done in a lock-free manner, so low level drivers should * keep track of the initial console (if applicable) and may need to * perform direct, unlocked hardware writes in these hooks. 
*/ struct fb_ops { /* open/release and usage marking */ struct module *owner; int (*fb_open)(struct fb_info *info, int user); int (*fb_release)(struct fb_info *info, int user); /* For framebuffers with strange non linear layouts or that do not * work with normal memory mapped access */ ssize_t (*fb_read)(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos); ssize_t (*fb_write)(struct fb_info *info, const char __user *buf, size_t count, loff_t *ppos); /* checks var and eventually tweaks it to something supported, * DO NOT MODIFY PAR */ int (*fb_check_var)(struct fb_var_screeninfo *var, struct fb_info *info); /* set the video mode according to info->var */ int (*fb_set_par)(struct fb_info *info); /* set color register */ int (*fb_setcolreg)(unsigned regno, unsigned red, unsigned green, unsigned blue, unsigned transp, struct fb_info *info); /* set color registers in batch */ int (*fb_setcmap)(struct fb_cmap *cmap, struct fb_info *info); /* blank display */ int (*fb_blank)(int blank, struct fb_info *info); /* pan display */ int (*fb_pan_display)(struct fb_var_screeninfo *var, struct fb_info *info); /* Draws a rectangle */ void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect); /* Copy data from area to another */ void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region); /* Draws a image to the display */ void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image); /* Draws cursor */ int (*fb_cursor) (struct fb_info *info, struct fb_cursor *cursor); /* wait for blit idle, optional */ int (*fb_sync)(struct fb_info *info); /* perform fb specific ioctl (optional) */ int (*fb_ioctl)(struct fb_info *info, unsigned int cmd, unsigned long arg); /* Handle 32bit compat ioctl (optional) */ int (*fb_compat_ioctl)(struct fb_info *info, unsigned cmd, unsigned long arg); /* perform fb specific mmap */ int (*fb_mmap)(struct fb_info *info, struct vm_area_struct *vma); /* get capability given var */ void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps, struct fb_var_screeninfo *var); /* teardown any resources to do with this framebuffer */ void (*fb_destroy)(struct fb_info *info); /* called at KDB enter and leave time to prepare the console */ int (*fb_debug_enter)(struct fb_info *info); int (*fb_debug_leave)(struct fb_info *info); }; #ifdef CONFIG_FB_TILEBLITTING #define FB_TILE_CURSOR_NONE 0 #define FB_TILE_CURSOR_UNDERLINE 1 #define FB_TILE_CURSOR_LOWER_THIRD 2 #define FB_TILE_CURSOR_LOWER_HALF 3 #define FB_TILE_CURSOR_TWO_THIRDS 4 #define FB_TILE_CURSOR_BLOCK 5 struct fb_tilemap { __u32 width; /* width of each tile in pixels */ __u32 height; /* height of each tile in scanlines */ __u32 depth; /* color depth of each tile */ __u32 length; /* number of tiles in the map */ const __u8 *data; /* actual tile map: a bitmap array, packed to the nearest byte */ }; struct fb_tilerect { __u32 sx; /* origin in the x-axis */ __u32 sy; /* origin in the y-axis */ __u32 width; /* number of tiles in the x-axis */ __u32 height; /* number of tiles in the y-axis */ __u32 index; /* what tile to use: index to tile map */ __u32 fg; /* foreground color */ __u32 bg; /* background color */ __u32 rop; /* raster operation */ }; struct fb_tilearea { __u32 sx; /* source origin in the x-axis */ __u32 sy; /* source origin in the y-axis */ __u32 dx; /* destination origin in the x-axis */ __u32 dy; /* destination origin in the y-axis */ __u32 width; /* number of tiles in the x-axis */ __u32 height; /* number of tiles in the y-axis */ }; struct fb_tileblit { 
__u32 sx; /* origin in the x-axis */ __u32 sy; /* origin in the y-axis */ __u32 width; /* number of tiles in the x-axis */ __u32 height; /* number of tiles in the y-axis */ __u32 fg; /* foreground color */ __u32 bg; /* background color */ __u32 length; /* number of tiles to draw */ __u32 *indices; /* array of indices to tile map */ }; struct fb_tilecursor { __u32 sx; /* cursor position in the x-axis */ __u32 sy; /* cursor position in the y-axis */ __u32 mode; /* 0 = erase, 1 = draw */ __u32 shape; /* see FB_TILE_CURSOR_* */ __u32 fg; /* foreground color */ __u32 bg; /* background color */ }; struct fb_tile_ops { /* set tile characteristics */ void (*fb_settile)(struct fb_info *info, struct fb_tilemap *map); /* all dimensions from hereon are in terms of tiles */ /* move a rectangular region of tiles from one area to another*/ void (*fb_tilecopy)(struct fb_info *info, struct fb_tilearea *area); /* fill a rectangular region with a tile */ void (*fb_tilefill)(struct fb_info *info, struct fb_tilerect *rect); /* copy an array of tiles */ void (*fb_tileblit)(struct fb_info *info, struct fb_tileblit *blit); /* cursor */ void (*fb_tilecursor)(struct fb_info *info, struct fb_tilecursor *cursor); /* get maximum length of the tile map */ int (*fb_get_tilemax)(struct fb_info *info); }; #endif /* CONFIG_FB_TILEBLITTING */ /* FBINFO_* = fb_info.flags bit flags */ #define FBINFO_HWACCEL_DISABLED 0x0002 /* When FBINFO_HWACCEL_DISABLED is set: * Hardware acceleration is turned off. Software implementations * of required functions (copyarea(), fillrect(), and imageblit()) * takes over; acceleration engine should be in a quiescent state */ /* hints */ #define FBINFO_VIRTFB 0x0004 /* FB is System RAM, not device. */ #define FBINFO_PARTIAL_PAN_OK 0x0040 /* otw use pan only for double-buffering */ #define FBINFO_READS_FAST 0x0080 /* soft-copy faster than rendering */ /* hardware supported ops */ /* semantics: when a bit is set, it indicates that the operation is * accelerated by hardware. * required functions will still work even if the bit is not set. * optional functions may not even exist if the flag bit is not set. */ #define FBINFO_HWACCEL_NONE 0x0000 #define FBINFO_HWACCEL_COPYAREA 0x0100 /* required */ #define FBINFO_HWACCEL_FILLRECT 0x0200 /* required */ #define FBINFO_HWACCEL_IMAGEBLIT 0x0400 /* required */ #define FBINFO_HWACCEL_ROTATE 0x0800 /* optional */ #define FBINFO_HWACCEL_XPAN 0x1000 /* optional */ #define FBINFO_HWACCEL_YPAN 0x2000 /* optional */ #define FBINFO_HWACCEL_YWRAP 0x4000 /* optional */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ /* A driver may set this flag to indicate that it does want a set_par to be * called every time when fbcon_switch is executed. The advantage is that with * this flag set you can really be sure that set_par is always called before * any of the functions dependent on the correct hardware state or altering * that state, even if you are using some broken X releases. The disadvantage * is that it introduces unwanted delays to every console switch if set_par * is slow. It is a good idea to try this flag in the drivers initialization * code whenever there is a bug report related to switching between X and the * framebuffer console. */ #define FBINFO_MISC_ALWAYS_SETPAR 0x40000 /* * Host and GPU endianness differ. */ #define FBINFO_FOREIGN_ENDIAN 0x100000 /* * Big endian math. This is the same flags as above, but with different * meaning, it is set by the fb subsystem depending FOREIGN_ENDIAN flag * and host endianness. 
Drivers should not use this flag. */ #define FBINFO_BE_MATH 0x100000 /* * Hide smem_start in the FBIOGET_FSCREENINFO IOCTL. This is used by modern DRM * drivers to stop userspace from trying to share buffers behind the kernel's * back. Instead dma-buf based buffer sharing should be used. */ #define FBINFO_HIDE_SMEM_START 0x200000 struct fb_info { refcount_t count; int node; int flags; /* * -1 by default, set to a FB_ROTATE_* value by the driver, if it knows * a lcd is not mounted upright and fbcon should rotate to compensate. */ int fbcon_rotate_hint; struct mutex lock; /* Lock for open/release/ioctl funcs */ struct mutex mm_lock; /* Lock for fb_mmap and smem_* fields */ struct fb_var_screeninfo var; /* Current var */ struct fb_fix_screeninfo fix; /* Current fix */ struct fb_monspecs monspecs; /* Current Monitor specs */ struct fb_pixmap pixmap; /* Image hardware mapper */ struct fb_pixmap sprite; /* Cursor hardware mapper */ struct fb_cmap cmap; /* Current cmap */ struct list_head modelist; /* mode list */ struct fb_videomode *mode; /* current mode */ int blank; /* current blanking; see FB_BLANK_ constants */ #if IS_ENABLED(CONFIG_FB_BACKLIGHT) /* assigned backlight device */ /* set before framebuffer registration, remove after unregister */ struct backlight_device *bl_dev; /* Backlight level curve */ struct mutex bl_curve_mutex; u8 bl_curve[FB_BACKLIGHT_LEVELS]; #endif /* * Assigned LCD device; set before framebuffer * registration, remove after unregister */ struct lcd_device *lcd_dev; #ifdef CONFIG_FB_DEFERRED_IO struct delayed_work deferred_work; unsigned long npagerefs; struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; #endif const struct fb_ops *fbops; struct device *device; /* This is the parent */ #if defined(CONFIG_FB_DEVICE) struct device *dev; /* This is this fb device */ #endif int class_flag; /* private sysfs flags */ #ifdef CONFIG_FB_TILEBLITTING struct fb_tile_ops *tileops; /* Tile Blitting */ #endif union { char __iomem *screen_base; /* Virtual address */ char *screen_buffer; }; unsigned long screen_size; /* Amount of ioremapped VRAM or 0 */ void *pseudo_palette; /* Fake palette of 16 colors */ #define FBINFO_STATE_RUNNING 0 #define FBINFO_STATE_SUSPENDED 1 u32 state; /* Hardware state i.e suspend */ void *fbcon_par; /* fbcon use-only private area */ /* From here on everything is device dependent */ void *par; bool skip_vt_switch; /* no VT switch on suspend/resume required */ bool skip_panic; /* Do not write to the fb after a panic */ }; /* This will go away * fbset currently hacks in FB_ACCELF_TEXT into var.accel_flags * when it wants to turn the acceleration engine on. This is * really a separate operation, and should be modified via sysfs. * But for now, we leave it broken with the following define */ #define STUPID_ACCELF_TEXT_SHIT #define FB_LEFT_POS(p, bpp) (fb_be_math(p) ? (32 - (bpp)) : 0) #define FB_SHIFT_HIGH(p, val, bits) (fb_be_math(p) ? (val) >> (bits) : \ (val) << (bits)) #define FB_SHIFT_LOW(p, val, bits) (fb_be_math(p) ? 
(val) << (bits) : \ (val) >> (bits)) /* * `Generic' versions of the frame buffer device operations */ extern int fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var); extern int fb_pan_display(struct fb_info *info, struct fb_var_screeninfo *var); extern int fb_blank(struct fb_info *info, int blank); /* * Helpers for framebuffers in I/O memory */ extern void cfb_fillrect(struct fb_info *info, const struct fb_fillrect *rect); extern void cfb_copyarea(struct fb_info *info, const struct fb_copyarea *area); extern void cfb_imageblit(struct fb_info *info, const struct fb_image *image); extern ssize_t fb_io_read(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos); extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf, size_t count, loff_t *ppos); int fb_io_mmap(struct fb_info *info, struct vm_area_struct *vma); #define __FB_DEFAULT_IOMEM_OPS_RDWR \ .fb_read = fb_io_read, \ .fb_write = fb_io_write #define __FB_DEFAULT_IOMEM_OPS_DRAW \ .fb_fillrect = cfb_fillrect, \ .fb_copyarea = cfb_copyarea, \ .fb_imageblit = cfb_imageblit #define __FB_DEFAULT_IOMEM_OPS_MMAP \ .fb_mmap = fb_io_mmap #define FB_DEFAULT_IOMEM_OPS \ __FB_DEFAULT_IOMEM_OPS_RDWR, \ __FB_DEFAULT_IOMEM_OPS_DRAW, \ __FB_DEFAULT_IOMEM_OPS_MMAP /* * Helpers for framebuffers in system memory */ extern void sys_fillrect(struct fb_info *info, const struct fb_fillrect *rect); extern void sys_copyarea(struct fb_info *info, const struct fb_copyarea *area); extern void sys_imageblit(struct fb_info *info, const struct fb_image *image); extern ssize_t fb_sys_read(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos); extern ssize_t fb_sys_write(struct fb_info *info, const char __user *buf, size_t count, loff_t *ppos); #define __FB_DEFAULT_SYSMEM_OPS_RDWR \ .fb_read = fb_sys_read, \ .fb_write = fb_sys_write #define __FB_DEFAULT_SYSMEM_OPS_DRAW \ .fb_fillrect = sys_fillrect, \ .fb_copyarea = sys_copyarea, \ .fb_imageblit = sys_imageblit /* * Helpers for framebuffers in DMA-able memory */ #define __FB_DEFAULT_DMAMEM_OPS_RDWR \ .fb_read = fb_sys_read, \ .fb_write = fb_sys_write #define __FB_DEFAULT_DMAMEM_OPS_DRAW \ .fb_fillrect = sys_fillrect, \ .fb_copyarea = sys_copyarea, \ .fb_imageblit = sys_imageblit /* fbmem.c */ extern int register_framebuffer(struct fb_info *fb_info); extern void unregister_framebuffer(struct fb_info *fb_info); extern int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, u32 shift_high, u32 shift_low, u32 mod); extern void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height); extern void fb_set_suspend(struct fb_info *info, int state); extern int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix); extern int fb_get_options(const char *name, char **option); extern int fb_new_modelist(struct fb_info *info); static inline void lock_fb_info(struct fb_info *info) { mutex_lock(&info->lock); } static inline void unlock_fb_info(struct fb_info *info) { mutex_unlock(&info->lock); } static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) { u32 i, j; d_pitch -= s_pitch; for (i = height; i--; ) { /* s_pitch is a few bytes at the most, memcpy is suboptimal */ for (j = 0; j < s_pitch; j++) *dst++ = *src++; dst += d_pitch; } } /* fb_defio.c */ int fb_deferred_io_mmap(struct fb_info *info, 
struct vm_area_struct *vma); extern int fb_deferred_io_init(struct fb_info *info); extern void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file); extern void fb_deferred_io_release(struct fb_info *info); extern void fb_deferred_io_cleanup(struct fb_info *info); extern int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* * Generate callbacks for deferred I/O */ #define __FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, __mode) \ static ssize_t __prefix ## _defio_read(struct fb_info *info, char __user *buf, \ size_t count, loff_t *ppos) \ { \ return fb_ ## __mode ## _read(info, buf, count, ppos); \ } \ static ssize_t __prefix ## _defio_write(struct fb_info *info, const char __user *buf, \ size_t count, loff_t *ppos) \ { \ unsigned long offset = *ppos; \ ssize_t ret = fb_ ## __mode ## _write(info, buf, count, ppos); \ if (ret > 0) \ __damage_range(info, offset, ret); \ return ret; \ } #define __FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, __mode) \ static void __prefix ## _defio_fillrect(struct fb_info *info, \ const struct fb_fillrect *rect) \ { \ __mode ## _fillrect(info, rect); \ __damage_area(info, rect->dx, rect->dy, rect->width, rect->height); \ } \ static void __prefix ## _defio_copyarea(struct fb_info *info, \ const struct fb_copyarea *area) \ { \ __mode ## _copyarea(info, area); \ __damage_area(info, area->dx, area->dy, area->width, area->height); \ } \ static void __prefix ## _defio_imageblit(struct fb_info *info, \ const struct fb_image *image) \ { \ __mode ## _imageblit(info, image); \ __damage_area(info, image->dx, image->dy, image->width, image->height); \ } #define FB_GEN_DEFAULT_DEFERRED_IOMEM_OPS(__prefix, __damage_range, __damage_area) \ __FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, io) \ __FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, cfb) #define FB_GEN_DEFAULT_DEFERRED_SYSMEM_OPS(__prefix, __damage_range, __damage_area) \ __FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, sys) \ __FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, sys) #define FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(__prefix, __damage_range, __damage_area) \ __FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, sys) \ __FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, sys) /* * Initializes struct fb_ops for deferred I/O. 
*/ #define __FB_DEFAULT_DEFERRED_OPS_RDWR(__prefix) \ .fb_read = __prefix ## _defio_read, \ .fb_write = __prefix ## _defio_write #define __FB_DEFAULT_DEFERRED_OPS_DRAW(__prefix) \ .fb_fillrect = __prefix ## _defio_fillrect, \ .fb_copyarea = __prefix ## _defio_copyarea, \ .fb_imageblit = __prefix ## _defio_imageblit #define __FB_DEFAULT_DEFERRED_OPS_MMAP(__prefix) \ .fb_mmap = fb_deferred_io_mmap #define FB_DEFAULT_DEFERRED_OPS(__prefix) \ __FB_DEFAULT_DEFERRED_OPS_RDWR(__prefix), \ __FB_DEFAULT_DEFERRED_OPS_DRAW(__prefix), \ __FB_DEFAULT_DEFERRED_OPS_MMAP(__prefix) static inline bool fb_be_math(struct fb_info *info) { #ifdef CONFIG_FB_FOREIGN_ENDIAN #if defined(CONFIG_FB_BOTH_ENDIAN) return info->flags & FBINFO_BE_MATH; #elif defined(CONFIG_FB_BIG_ENDIAN) return true; #elif defined(CONFIG_FB_LITTLE_ENDIAN) return false; #endif /* CONFIG_FB_BOTH_ENDIAN */ #else #ifdef __BIG_ENDIAN return true; #else return false; #endif /* __BIG_ENDIAN */ #endif /* CONFIG_FB_FOREIGN_ENDIAN */ } extern struct fb_info *framebuffer_alloc(size_t size, struct device *dev); extern void framebuffer_release(struct fb_info *info); extern void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max); #if IS_ENABLED(CONFIG_FB_BACKLIGHT) struct backlight_device *fb_bl_device(struct fb_info *info); void fb_bl_notify_blank(struct fb_info *info, int old_blank); #else static inline struct backlight_device *fb_bl_device(struct fb_info *info) { return NULL; } static inline void fb_bl_notify_blank(struct fb_info *info, int old_blank) { } #endif static inline struct lcd_device *fb_lcd_device(struct fb_info *info) { return info->lcd_dev; } /* fbmon.c */ #define FB_MAXTIMINGS 0 #define FB_VSYNCTIMINGS 1 #define FB_HSYNCTIMINGS 2 #define FB_DCLKTIMINGS 3 #define FB_IGNOREMON 0x100 #define FB_MODE_IS_UNKNOWN 0 #define FB_MODE_IS_DETAILED 1 #define FB_MODE_IS_STANDARD 2 #define FB_MODE_IS_VESA 4 #define FB_MODE_IS_CALCULATED 8 #define FB_MODE_IS_FIRST 16 #define FB_MODE_IS_FROM_VAR 32 extern int fbmon_dpms(const struct fb_info *fb_info); extern int fb_get_mode(int flags, u32 val, struct fb_var_screeninfo *var, struct fb_info *info); extern int fb_validate_mode(const struct fb_var_screeninfo *var, struct fb_info *info); extern int fb_parse_edid(unsigned char *edid, struct fb_var_screeninfo *var); extern const unsigned char *fb_firmware_edid(struct device *device); extern void fb_edid_to_monspecs(unsigned char *edid, struct fb_monspecs *specs); extern void fb_destroy_modedb(struct fb_videomode *modedb); extern int fb_find_mode_cvt(struct fb_videomode *mode, int margins, int rb); extern unsigned char *fb_ddc_read(struct i2c_adapter *adapter); extern int of_get_fb_videomode(struct device_node *np, struct fb_videomode *fb, int index); extern int fb_videomode_from_videomode(const struct videomode *vm, struct fb_videomode *fbmode); /* modedb.c */ #define VESA_MODEDB_SIZE 43 #define DMT_SIZE 0x50 extern void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var); extern void fb_videomode_to_var(struct fb_var_screeninfo *var, const struct fb_videomode *mode); extern int fb_mode_is_equal(const struct fb_videomode *mode1, const struct fb_videomode *mode2); extern int fb_add_videomode(const struct fb_videomode *mode, struct list_head *head); extern void fb_delete_videomode(const struct fb_videomode *mode, struct list_head *head); extern const struct fb_videomode *fb_match_mode(const struct fb_var_screeninfo *var, struct list_head *head); extern const struct fb_videomode *fb_find_best_mode(const struct 
fb_var_screeninfo *var, struct list_head *head); extern const struct fb_videomode *fb_find_nearest_mode(const struct fb_videomode *mode, struct list_head *head); extern void fb_destroy_modelist(struct list_head *head); extern void fb_videomode_to_modelist(const struct fb_videomode *modedb, int num, struct list_head *head); extern const struct fb_videomode *fb_find_best_display(const struct fb_monspecs *specs, struct list_head *head); /* fbcmap.c */ extern int fb_alloc_cmap(struct fb_cmap *cmap, int len, int transp); extern int fb_alloc_cmap_gfp(struct fb_cmap *cmap, int len, int transp, gfp_t flags); extern void fb_dealloc_cmap(struct fb_cmap *cmap); extern int fb_copy_cmap(const struct fb_cmap *from, struct fb_cmap *to); extern int fb_cmap_to_user(const struct fb_cmap *from, struct fb_cmap_user *to); extern int fb_set_cmap(struct fb_cmap *cmap, struct fb_info *fb_info); extern int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *fb_info); extern const struct fb_cmap *fb_default_cmap(int len); extern void fb_invert_cmaps(void); struct fb_videomode { const char *name; /* optional */ u32 refresh; /* optional */ u32 xres; u32 yres; u32 pixclock; u32 left_margin; u32 right_margin; u32 upper_margin; u32 lower_margin; u32 hsync_len; u32 vsync_len; u32 sync; u32 vmode; u32 flag; }; struct dmt_videomode { u32 dmt_id; u32 std_2byte_code; u32 cvt_3byte_code; const struct fb_videomode *mode; }; extern const struct fb_videomode vesa_modes[]; extern const struct dmt_videomode dmt_modes[]; struct fb_modelist { struct list_head list; struct fb_videomode mode; }; extern int fb_find_mode(struct fb_var_screeninfo *var, struct fb_info *info, const char *mode_option, const struct fb_videomode *db, unsigned int dbsize, const struct fb_videomode *default_mode, unsigned int default_bpp); bool fb_modesetting_disabled(const char *drvname); /* * Convenience logging macros */ #define fb_err(fb_info, fmt, ...) \ pr_err("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_notice(info, fmt, ...) \ pr_notice("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_warn(fb_info, fmt, ...) \ pr_warn("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_info(fb_info, fmt, ...) \ pr_info("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_dbg(fb_info, fmt, ...) \ pr_debug("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_warn_once(fb_info, fmt, ...) \ pr_warn_once("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_WARN_ONCE(fb_info, condition, fmt, ...) \ WARN_ONCE(condition, "fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__) #define fb_WARN_ON_ONCE(fb_info, x) \ fb_WARN_ONCE(fb_info, (x), "%s", "fb_WARN_ON_ONCE(" __stringify(x) ")") #endif /* _LINUX_FB_H */ |
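A hedged driver-side sketch showing how the declarations in this header are typically combined; example_fb_ops and example_fb_set_par are illustrative names, not kernel symbols. A framebuffer that lives in I/O memory can take its read/write, drawing and mmap callbacks wholesale from FB_DEFAULT_IOMEM_OPS and only supply the hardware-specific hooks.

#include <linux/fb.h>
#include <linux/module.h>

static int example_fb_set_par(struct fb_info *info)
{
        fb_dbg(info, "programming mode %ux%u\n",
               info->var.xres, info->var.yres);
        /* program the hardware from info->var here */
        return 0;
}

static const struct fb_ops example_fb_ops = {
        .owner      = THIS_MODULE,
        FB_DEFAULT_IOMEM_OPS,           /* fb_io_read/write, cfb_* drawing, fb_io_mmap */
        .fb_set_par = example_fb_set_par,
};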
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_LEGACY_H #define __X86_KERNEL_FPU_LEGACY_H #include <asm/fpu/types.h> extern unsigned int mxcsr_feature_mask; static inline void ldmxcsr(u32 mxcsr) { asm volatile("ldmxcsr %0" :: "m" (mxcsr)); } /* * Returns 0 on success or the trap number when the operation raises an * exception. */ #define user_insn(insn, output, input...) \ ({ \ int err; \ \ might_fault(); \ \ asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn_err(insn, output, input...) \ ({ \ int err; \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ : output : input) static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); } static inline void fxrstor(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_safe(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void frstor(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_safe(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } #endif |
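A hedged usage sketch of the fault-tolerant restore wrappers above; example_restore_fxregs is illustrative, and the default control-word and MXCSR values are assumptions about a sane reset state rather than something this header mandates. It shows why the *_safe() variants exist: the caller can detect a rejected register image and substitute a clean one instead of taking an unhandled fault.

#include <linux/string.h>

static void example_restore_fxregs(struct fxregs_state *fx)
{
        if (fxrstor_safe(fx)) {
                /* Saved image was rejected by the CPU; reload a sane state. */
                memset(fx, 0, sizeof(*fx));
                fx->cwd = 0x37f;                /* assumed x87 default control word */
                fx->mxcsr = MXCSR_DEFAULT;      /* assumed SSE default */
                fxrstor(fx);
        }
}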
| 190 169 168 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction attribute tables * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" /* Attribute search APIs */ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) { return inat_primary_table[opcode]; } int inat_get_last_prefix_id(insn_byte_t last_pfx) { insn_attr_t lpfx_attr; lpfx_attr = inat_get_opcode_attribute(last_pfx); return inat_last_prefix_id(lpfx_attr); } insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; int n; n = inat_escape_id(esc_attr); table = inat_escape_tables[n][0]; if (!table) return 0; if (inat_has_variant(table[opcode]) && lpfx_id) { table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; int n; n = inat_group_id(grp_attr); table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } return table[X86_MODRM_REG(modrm)] | inat_group_common_attribute(grp_attr); } insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_p) { const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; /* At first, this checks the master table */ table = inat_avx_tables[vex_m][0]; if (!table) return 0; if (!inat_is_group(table[opcode]) && vex_p) { /* If this is not a group, get attribute directly */ table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) { const insn_attr_t *table; if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) return 0; map_select -= X86_XOP_M_MIN; /* At first, this checks the master table */ table = inat_xop_tables[map_select]; if (!table) return 0; return table[opcode]; } |
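A hedged sketch of how a caller such as the instruction decoder might chain these lookups; example_lookup is an illustrative helper, not part of this file. The primary opcode is classified first, and only if it turns out to be an escape byte (0x0F) is the second opcode byte resolved through the escape tables, keyed by the id of the last legacy prefix.

#include <asm/insn.h>

static insn_attr_t example_lookup(insn_byte_t op1, insn_byte_t op2,
                                  insn_byte_t last_pfx)
{
        insn_attr_t attr = inat_get_opcode_attribute(op1);
        int lpfx_id = inat_get_last_prefix_id(last_pfx);

        if (inat_is_escape(attr))       /* e.g. op1 == 0x0f */
                attr = inat_get_escape_attribute(op2, lpfx_id, attr);

        return attr;
}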
| 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 | // SPDX-License-Identifier: GPL-2.0-only /* * Software WEP encryption implementation * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2003, Instant802 Networks, Inc. * Copyright (C) 2023 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/random.h> #include <linux/compiler.h> #include <linux/crc32.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wep.h" void ieee80211_wep_init(struct ieee80211_local *local) { /* start WEP IV from a random value */ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN); } static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen) { /* * Fluhrer, Mantin, and Shamir have reported weaknesses in the * key scheduling algorithm of RC4. At least IVs (KeyByte + 3, * 0xff, N) can be used to speedup attacks, so avoid using them. 
*/ if ((iv & 0xff00) == 0xff00) { u8 B = (iv >> 16) & 0xff; if (B >= 3 && B < 3 + keylen) return true; } return false; } static void ieee80211_wep_get_iv(struct ieee80211_local *local, int keylen, int keyidx, u8 *iv) { local->wep_iv++; if (ieee80211_wep_weak_iv(local->wep_iv, keylen)) local->wep_iv += 0x0100; if (!iv) return; *iv++ = (local->wep_iv >> 16) & 0xff; *iv++ = (local->wep_iv >> 8) & 0xff; *iv++ = local->wep_iv & 0xff; *iv++ = keyidx << 6; } static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, struct sk_buff *skb, int keylen, int keyidx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; u8 *newhdr; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) return NULL; hdrlen = ieee80211_hdrlen(hdr->frame_control); newhdr = skb_push(skb, IEEE80211_WEP_IV_LEN); memmove(newhdr, newhdr + IEEE80211_WEP_IV_LEN, hdrlen); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return newhdr + hdrlen; ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen); return newhdr + hdrlen; } static void ieee80211_wep_remove_iv(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; hdrlen = ieee80211_hdrlen(hdr->frame_control); memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); } /* Perform WEP encryption using given key. data buffer must have tailroom * for 4-byte ICV. data_len must not include this ICV. Note: this function * does _not_ add IV. data = RC4(data | CRC32(data)) */ int ieee80211_wep_encrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 icv; icv = cpu_to_le32(~crc32_le(~0, data, data_len)); put_unaligned(icv, (__le32 *)(data + data_len)); arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); return 0; } /* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the * beginning of the buffer 4 bytes of extra space (ICV) in the end of the * buffer will be added. Both IV and ICV will be transmitted, so the * payload length increases with 8 bytes. * * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) */ int ieee80211_wep_encrypt(struct ieee80211_local *local, struct sk_buff *skb, const u8 *key, int keylen, int keyidx) { u8 *iv; size_t len; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN)) return -1; iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); if (!iv) return -1; len = skb->len - (iv + IEEE80211_WEP_IV_LEN - skb->data); /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, iv, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key, keylen); /* Add room for ICV */ skb_put(skb, IEEE80211_WEP_ICV_LEN); return ieee80211_wep_encrypt_data(&local->wep_tx_ctx, rc4key, keylen + 3, iv + IEEE80211_WEP_IV_LEN, len); } /* Perform WEP decryption using given key. data buffer includes encrypted * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV. * Return 0 on success and -1 on ICV mismatch. 
*/ int ieee80211_wep_decrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 crc; arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); crc = cpu_to_le32(~crc32_le(~0, data, data_len)); if (memcmp(&crc, data + data_len, IEEE80211_WEP_ICV_LEN) != 0) /* ICV mismatch */ return -1; return 0; } /* Perform WEP decryption on given skb. Buffer includes whole WEP part of * the frame: IV (4 bytes), encrypted payload (including SNAP header), * ICV (4 bytes). skb->len includes both IV and ICV. * * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload * is moved to the beginning of the skb and skb length will be reduced. */ static int ieee80211_wep_decrypt(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { u32 klen; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; u8 keyidx; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; size_t len; int ret = 0; if (!ieee80211_has_protected(hdr->frame_control)) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen + IEEE80211_WEP_IV_LEN + IEEE80211_WEP_ICV_LEN) return -1; len = skb->len - hdrlen - IEEE80211_WEP_IV_LEN - IEEE80211_WEP_ICV_LEN; keyidx = skb->data[hdrlen + 3] >> 6; if (!key || keyidx != key->conf.keyidx) return -1; klen = 3 + key->conf.keylen; /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, skb->data + hdrlen, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key->conf.key, key->conf.keylen); if (ieee80211_wep_decrypt_data(&local->wep_rx_ctx, rc4key, klen, skb->data + hdrlen + IEEE80211_WEP_IV_LEN, len)) ret = -1; /* Trim ICV */ skb_trim(skb, skb->len - IEEE80211_WEP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); return ret; } ieee80211_rx_result ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc)) return RX_CONTINUE; if (!(status->flag & RX_FLAG_DECRYPTED)) { if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key)) return RX_DROP_U_WEP_DEC_FAIL; } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) { if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) + IEEE80211_WEP_IV_LEN)) return RX_DROP_U_NO_IV; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ if (!(status->flag & RX_FLAG_ICV_STRIPPED) && pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) return RX_DROP_U_NO_ICV; } return RX_CONTINUE; } static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_key_conf *hw_key = info->control.hw_key; if (!hw_key) { if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } else if ((hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) || (hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { if (!ieee80211_wep_add_iv(tx->local, skb, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } return 0; } ieee80211_tx_result ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; 
ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (wep_encrypt_skb(tx, skb) < 0) { I802_DEBUG_INC(tx->local->tx_handlers_drop_wep); return TX_DROP; } } return TX_CONTINUE; }
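/*
 * Self-contained userspace sketch (not part of the kernel source) of the
 * transform ieee80211_wep_encrypt_data() performs above: append the CRC-32
 * of the payload as the ICV, then RC4 the payload+ICV with IV||key.  Local
 * crc32_demo()/rc4_demo() helpers stand in for the kernel's crc32_le() and
 * arc4 services; the IV, key and payload are made-up demo values.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Standard reflected CRC-32 (poly 0xedb88320, init/final 0xffffffff);
 * equivalent to the kernel's ~crc32_le(~0, data, len) used above. */
static uint32_t crc32_demo(const uint8_t *p, size_t len)
{
	uint32_t crc = 0xffffffff;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0xedb88320 & -(crc & 1));
	}
	return ~crc;
}

/* Plain RC4: key schedule, then in-place XOR with the keystream. */
static void rc4_demo(const uint8_t *key, size_t klen, uint8_t *data, size_t len)
{
	uint8_t S[256], i = 0, j = 0, t;

	for (int n = 0; n < 256; n++)
		S[n] = n;
	for (int n = 0; n < 256; n++) {
		j += S[n] + key[n % klen];
		t = S[n]; S[n] = S[j]; S[j] = t;
	}
	j = 0;
	for (size_t n = 0; n < len; n++) {
		i++;
		j += S[i];
		t = S[i]; S[i] = S[j]; S[j] = t;
		data[n] ^= S[(uint8_t)(S[i] + S[j])];
	}
}

int main(void)
{
	/* 3-byte IV prepended to a 13-byte WEP-104 secret, mirroring the
	 * rc4key layout in ieee80211_wep_encrypt() (demo values only). */
	uint8_t rc4key[3 + 13] = {
		0x01, 0x02, 0x03,
		0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6,
		0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac,
	};
	uint8_t buf[16 + 4];		/* payload + 4 bytes of ICV tailroom */
	size_t data_len = 16;
	uint32_t icv;

	memcpy(buf, "WEP demo payload", data_len);

	/* Append the ICV little-endian, then data = RC4(data | CRC32(data)) */
	icv = crc32_demo(buf, data_len);
	for (int k = 0; k < 4; k++)
		buf[data_len + k] = (icv >> (8 * k)) & 0xff;
	rc4_demo(rc4key, sizeof(rc4key), buf, data_len + 4);

	for (size_t n = 0; n < data_len + 4; n++)
		printf("%02x", buf[n]);
	printf("\n");
	return 0;
}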
// SPDX-License-Identifier: GPL-2.0 /* * Media device request objects * * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2018 Intel Corporation * * Author: Hans Verkuil <hansverk@cisco.com> * Author: Sakari Ailus <sakari.ailus@linux.intel.com> */ #ifndef MEDIA_REQUEST_H #define MEDIA_REQUEST_H #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <media/media-device.h> /** * enum media_request_state - media request state * * @MEDIA_REQUEST_STATE_IDLE: Idle * @MEDIA_REQUEST_STATE_VALIDATING: Validating the request, no state changes * allowed * @MEDIA_REQUEST_STATE_QUEUED: Queued * @MEDIA_REQUEST_STATE_COMPLETE: Completed, the request is done * @MEDIA_REQUEST_STATE_CLEANING: Cleaning, the request is being re-inited * @MEDIA_REQUEST_STATE_UPDATING: The request is being updated, i.e.
* request objects are being added, * modified or removed * @NR_OF_MEDIA_REQUEST_STATE: The number of media request states, used * internally for sanity check purposes */ enum media_request_state { MEDIA_REQUEST_STATE_IDLE, MEDIA_REQUEST_STATE_VALIDATING, MEDIA_REQUEST_STATE_QUEUED, MEDIA_REQUEST_STATE_COMPLETE, MEDIA_REQUEST_STATE_CLEANING, MEDIA_REQUEST_STATE_UPDATING, NR_OF_MEDIA_REQUEST_STATE, }; struct media_request_object; /** * struct media_request - Media device request * @mdev: Media device this request belongs to * @kref: Reference count * @debug_str: Prefix for debug messages (process name:fd) * @state: The state of the request * @updating_count: count the number of request updates that are in progress * @access_count: count the number of request accesses that are in progress * @objects: List of @struct media_request_object request objects * @num_incomplete_objects: The number of incomplete objects in the request * @poll_wait: Wait queue for poll * @lock: Serializes access to this struct */ struct media_request { struct media_device *mdev; struct kref kref; char debug_str[TASK_COMM_LEN + 11]; enum media_request_state state; unsigned int updating_count; unsigned int access_count; struct list_head objects; unsigned int num_incomplete_objects; wait_queue_head_t poll_wait; spinlock_t lock; }; #ifdef CONFIG_MEDIA_CONTROLLER /** * media_request_lock_for_access - Lock the request to access its objects * * @req: The media request * * Use before accessing a completed request. A reference to the request must * be held during the access. This usually takes place automatically through * a file handle. Use @media_request_unlock_for_access when done. */ static inline int __must_check media_request_lock_for_access(struct media_request *req) { unsigned long flags; int ret = -EBUSY; spin_lock_irqsave(&req->lock, flags); if (req->state == MEDIA_REQUEST_STATE_COMPLETE) { req->access_count++; ret = 0; } spin_unlock_irqrestore(&req->lock, flags); return ret; } /** * media_request_unlock_for_access - Unlock a request previously locked for * access * * @req: The media request * * Unlock a request that has previously been locked using * @media_request_lock_for_access. */ static inline void media_request_unlock_for_access(struct media_request *req) { unsigned long flags; spin_lock_irqsave(&req->lock, flags); if (!WARN_ON(!req->access_count)) req->access_count--; spin_unlock_irqrestore(&req->lock, flags); } /** * media_request_lock_for_update - Lock the request for updating its objects * * @req: The media request * * Use before updating a request, i.e. adding, modifying or removing a request * object in it. A reference to the request must be held during the update. This * usually takes place automatically through a file handle. Use * @media_request_unlock_for_update when done. */ static inline int __must_check media_request_lock_for_update(struct media_request *req) { unsigned long flags; int ret = 0; spin_lock_irqsave(&req->lock, flags); if (req->state == MEDIA_REQUEST_STATE_IDLE || req->state == MEDIA_REQUEST_STATE_UPDATING) { req->state = MEDIA_REQUEST_STATE_UPDATING; req->updating_count++; } else { ret = -EBUSY; } spin_unlock_irqrestore(&req->lock, flags); return ret; } /** * media_request_unlock_for_update - Unlock a request previously locked for * update * * @req: The media request * * Unlock a request that has previously been locked using * @media_request_lock_for_update. 
*/ static inline void media_request_unlock_for_update(struct media_request *req) { unsigned long flags; spin_lock_irqsave(&req->lock, flags); WARN_ON(req->updating_count <= 0); if (!--req->updating_count) req->state = MEDIA_REQUEST_STATE_IDLE; spin_unlock_irqrestore(&req->lock, flags); } /** * media_request_get - Get the media request * * @req: The media request * * Get the media request. */ static inline void media_request_get(struct media_request *req) { kref_get(&req->kref); } /** * media_request_put - Put the media request * * @req: The media request * * Put the media request. The media request will be released * when the refcount reaches 0. */ void media_request_put(struct media_request *req); /** * media_request_get_by_fd - Get a media request by fd * * @mdev: Media device this request belongs to * @request_fd: The file descriptor of the request * * Get the request represented by @request_fd that is owned * by the media device. * * Return a -EBADR error pointer if requests are not supported * by this driver. Return -EINVAL if the request was not found. * Return the pointer to the request if found: the caller will * have to call @media_request_put when it has finished using the * request. */ struct media_request * media_request_get_by_fd(struct media_device *mdev, int request_fd); /** * media_request_alloc - Allocate the media request * * @mdev: Media device this request belongs to * @alloc_fd: Store the request's file descriptor in this int * * Allocate the media request and put the fd in @alloc_fd. */ int media_request_alloc(struct media_device *mdev, int *alloc_fd); #else static inline void media_request_get(struct media_request *req) { } static inline void media_request_put(struct media_request *req) { } static inline struct media_request * media_request_get_by_fd(struct media_device *mdev, int request_fd) { return ERR_PTR(-EBADR); } #endif /** * struct media_request_object_ops - Media request object operations * @prepare: Validate and prepare the request object, optional. * @unprepare: Unprepare the request object, optional. * @queue: Queue the request object, optional. * @unbind: Unbind the request object, optional. * @release: Release the request object, required. */ struct media_request_object_ops { int (*prepare)(struct media_request_object *object); void (*unprepare)(struct media_request_object *object); void (*queue)(struct media_request_object *object); void (*unbind)(struct media_request_object *object); void (*release)(struct media_request_object *object); }; /** * struct media_request_object - An opaque object that belongs to a media * request * * @ops: object's operations * @priv: object's priv pointer * @req: the request this object belongs to (can be NULL) * @list: List entry of the object for @struct media_request * @kref: Reference count of the object, acquire before releasing req->lock * @completed: If true, then this object was completed. * * An object related to the request. This struct is always embedded in * another struct that contains the actual data for this request object. */ struct media_request_object { const struct media_request_object_ops *ops; void *priv; struct media_request *req; struct list_head list; struct kref kref; bool completed; }; #ifdef CONFIG_MEDIA_CONTROLLER /** * media_request_object_get - Get a media request object * * @obj: The object * * Get a media request object.
*/ static inline void media_request_object_get(struct media_request_object *obj) { kref_get(&obj->kref); } /** * media_request_object_put - Put a media request object * * @obj: The object * * Put a media request object. Once all references are gone, the * object's memory is released. */ void media_request_object_put(struct media_request_object *obj); /** * media_request_object_find - Find an object in a request * * @req: The media request * @ops: Find an object with this ops value * @priv: Find an object with this priv value * * Both @ops and @priv must be non-NULL. * * Returns the object pointer or NULL if not found. The caller must * call media_request_object_put() once it has finished using the object. * * Since this function needs to walk the list of objects it takes * the @req->lock spin lock to make this safe. */ struct media_request_object * media_request_object_find(struct media_request *req, const struct media_request_object_ops *ops, void *priv); /** * media_request_object_init - Initialise a media request object * * @obj: The object * * Initialise a media request object. The object will be released using the * release callback of the ops once it has no references (this function * initialises references to one). */ void media_request_object_init(struct media_request_object *obj); /** * media_request_object_bind - Bind a media request object to a request * * @req: The media request * @ops: The object ops for this object * @priv: A driver-specific priv pointer associated with this object * @is_buffer: Set to true if the object is a buffer object. * @obj: The object * * Bind this object to the request and set the ops and priv values of * the object so it can be found later with media_request_object_find(). * * Every bound object must be unbound or completed by the kernel at some * point in time, otherwise the request will never complete. When the * request is released all completed objects will be unbound by the * request core code. * * Buffer objects will be added to the end of the request's object * list, non-buffer objects will be added to the front of the list. * This ensures that all buffer objects are at the end of the list * and that all non-buffer objects that they depend on are processed * first. */ int media_request_object_bind(struct media_request *req, const struct media_request_object_ops *ops, void *priv, bool is_buffer, struct media_request_object *obj); /** * media_request_object_unbind - Unbind a media request object * * @obj: The object * * Unbind the media request object from the request. */ void media_request_object_unbind(struct media_request_object *obj); /** * media_request_object_complete - Mark the media request object as complete * * @obj: The object * * Mark the media request object as complete. Only bound objects can * be completed.
*/ void media_request_object_complete(struct media_request_object *obj); #else static inline int __must_check media_request_lock_for_access(struct media_request *req) { return -EINVAL; } static inline void media_request_unlock_for_access(struct media_request *req) { } static inline int __must_check media_request_lock_for_update(struct media_request *req) { return -EINVAL; } static inline void media_request_unlock_for_update(struct media_request *req) { } static inline void media_request_object_get(struct media_request_object *obj) { } static inline void media_request_object_put(struct media_request_object *obj) { } static inline struct media_request_object * media_request_object_find(struct media_request *req, const struct media_request_object_ops *ops, void *priv) { return NULL; } static inline void media_request_object_init(struct media_request_object *obj) { obj->ops = NULL; obj->req = NULL; } static inline int media_request_object_bind(struct media_request *req, const struct media_request_object_ops *ops, void *priv, bool is_buffer, struct media_request_object *obj) { return 0; } static inline void media_request_object_unbind(struct media_request_object *obj) { } static inline void media_request_object_complete(struct media_request_object *obj) { } #endif #endif
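/*
 * Hedged userspace sketch (not part of this header): the request life cycle
 * documented above, seen from the ioctl interface.  "/dev/media0" is an
 * assumed node and the empty request is purely illustrative -- a real caller
 * would set request-aware controls and queue buffers against req_fd between
 * allocation and MEDIA_REQUEST_IOC_QUEUE.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/media.h>

int main(void)
{
	int media_fd = open("/dev/media0", O_RDWR);
	int req_fd;

	if (media_fd < 0)
		return 1;

	/* IDLE: allocate a request; the returned fd holds a reference
	 * (the kref in struct media_request above) */
	if (ioctl(media_fd, MEDIA_IOC_REQUEST_ALLOC, &req_fd)) {
		perror("MEDIA_IOC_REQUEST_ALLOC");
		return 1;
	}

	/* ... bind objects here (UPDATING state): request-aware control
	 * sets and buffer queueing done against req_fd ... */

	/* VALIDATING -> QUEUED: hand the request over to the driver */
	if (ioctl(req_fd, MEDIA_REQUEST_IOC_QUEUE))
		perror("MEDIA_REQUEST_IOC_QUEUE");

	/* COMPLETE is signalled as an exceptional poll() condition */
	struct pollfd pfd = { .fd = req_fd, .events = POLLPRI };
	poll(&pfd, 1, 1000);

	/* CLEANING -> IDLE: recycle the request instead of reallocating */
	ioctl(req_fd, MEDIA_REQUEST_IOC_REINIT);

	close(req_fd);
	close(media_fd);
	return 0;
}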
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PROJID_H #define _LINUX_PROJID_H /* * A set of types for the internal kernel types representing project ids. * * The types defined in this header allow distinguishing which project ids in * the kernel are values used by userspace and which project id values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/types.h> struct user_namespace; extern struct user_namespace init_user_ns; typedef __kernel_uid32_t projid_t; typedef struct { projid_t val; } kprojid_t; static inline projid_t __kprojid_val(kprojid_t projid) { return projid.val; } #define KPROJIDT_INIT(value) (kprojid_t){ value } #define INVALID_PROJID KPROJIDT_INIT(-1) #define OVERFLOW_PROJID 65534 static inline bool projid_eq(kprojid_t left, kprojid_t right) { return __kprojid_val(left) == __kprojid_val(right); } static inline bool projid_lt(kprojid_t left, kprojid_t right) { return __kprojid_val(left) < __kprojid_val(right); } static inline bool projid_valid(kprojid_t projid) { return !projid_eq(projid, INVALID_PROJID); } #ifdef CONFIG_USER_NS extern kprojid_t make_kprojid(struct user_namespace *from, projid_t projid); extern projid_t from_kprojid(struct user_namespace *to, kprojid_t projid); extern projid_t from_kprojid_munged(struct user_namespace *to, kprojid_t projid); static inline bool kprojid_has_mapping(struct user_namespace *ns, kprojid_t projid) { return from_kprojid(ns, projid) != (projid_t)-1; } #else static inline kprojid_t make_kprojid(struct user_namespace *from, projid_t projid) { return KPROJIDT_INIT(projid); } static inline projid_t from_kprojid(struct user_namespace *to, kprojid_t kprojid) { return __kprojid_val(kprojid); } static inline projid_t from_kprojid_munged(struct user_namespace *to, kprojid_t kprojid) { projid_t projid = from_kprojid(to, kprojid); if (projid == (projid_t)-1) projid = OVERFLOW_PROJID; return projid; } static inline bool kprojid_has_mapping(struct user_namespace *ns, kprojid_t projid) { return true; } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_PROJID_H */
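/*
 * Minimal standalone sketch of the type-safety idea behind this header
 * (types re-declared locally so it builds outside the kernel): wrapping the
 * raw integer in a single-member struct makes the kernel-internal project id
 * a distinct type, so passing a userspace-visible projid_t where a kprojid_t
 * is expected fails at compile time instead of silently mixing id spaces.
 */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned int projid_t;                /* userspace-visible value */
typedef struct { projid_t val; } kprojid_t;   /* kernel-internal value   */

#define KPROJIDT_INIT(value) (kprojid_t){ value }

static bool projid_eq(kprojid_t left, kprojid_t right)
{
	return left.val == right.val;
}

int main(void)
{
	projid_t user_id = 1000;
	kprojid_t kernel_id = KPROJIDT_INIT(1000);

	/* OK: comparing two kernel-internal ids */
	printf("%d\n", projid_eq(kernel_id, KPROJIDT_INIT(1000)));

	/*
	 * Compile error if uncommented: a raw projid_t is not a kprojid_t,
	 * which is exactly the mistake the struct wrapper catches.
	 *
	 * projid_eq(kernel_id, user_id);
	 */
	(void)user_id;
	return 0;
}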
// SPDX-License-Identifier: GPL-2.0-only /* * vivid-vid-out.c - video output support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-osd.h" #include "vivid-vid-common.h" #include "vivid-kthread-out.h" #include "vivid-vid-out.h" static int vid_out_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); const struct vivid_fmt *vfmt = dev->fmt_out; unsigned planes = vfmt->buffers; unsigned h = dev->fmt_out_rect.height; unsigned int size = dev->bytesperline_out[0] * h + vfmt->data_offset[0]; unsigned p; for (p = vfmt->buffers; p < vfmt->planes; p++) size += dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p]; if (dev->field_out == V4L2_FIELD_ALTERNATE) { /* * You cannot use write() with FIELD_ALTERNATE since the field * information (TOP/BOTTOM) cannot be passed to the kernel. */ if (vb2_fileio_is_active(vq)) return -EINVAL; } if (dev->queue_setup_error) { /* * Error injection: test what happens if queue_setup() returns * an error. */ dev->queue_setup_error = false; return -EINVAL; } if (*nplanes) { /* * Check if the number of requested planes matches * the number of planes in the current format. You can't mix that. */ if (*nplanes != planes) return -EINVAL; if (sizes[0] < size) return -EINVAL; for (p = 1; p < planes; p++) { if (sizes[p] < dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p]) return -EINVAL; } } else { for (p = 0; p < planes; p++) sizes[p] = p ?
dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p] : size; } *nplanes = planes; dprintk(dev, 1, "%s: count=%u\n", __func__, *nbuffers); for (p = 0; p < planes; p++) dprintk(dev, 1, "%s: size[%u]=%u\n", __func__, p, sizes[p]); return 0; } static int vid_out_buf_out_validate(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); dprintk(dev, 1, "%s\n", __func__); if (dev->field_out != V4L2_FIELD_ALTERNATE) vbuf->field = dev->field_out; else if (vbuf->field != V4L2_FIELD_TOP && vbuf->field != V4L2_FIELD_BOTTOM) return -EINVAL; return 0; } static int vid_out_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); const struct vivid_fmt *vfmt = dev->fmt_out; unsigned int planes = vfmt->buffers; unsigned int h = dev->fmt_out_rect.height; unsigned int size = dev->bytesperline_out[0] * h; unsigned p; for (p = vfmt->buffers; p < vfmt->planes; p++) size += dev->bytesperline_out[p] * h / vfmt->vdownsampling[p]; dprintk(dev, 1, "%s\n", __func__); if (WARN_ON(NULL == dev->fmt_out)) return -EINVAL; if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } for (p = 0; p < planes; p++) { if (p) size = dev->bytesperline_out[p] * h / vfmt->vdownsampling[p]; size += vb->planes[p].data_offset; if (vb2_get_plane_payload(vb, p) < size) { dprintk(dev, 1, "%s the payload is too small for plane %u (%lu < %u)\n", __func__, p, vb2_get_plane_payload(vb, p), size); return -EINVAL; } } return 0; } static void vid_out_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vid_out_active); spin_unlock(&dev->slock); } static int vid_out_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err; dev->vid_out_seq_count = 0; dprintk(dev, 1, "%s\n", __func__); if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_out(dev, &dev->vid_out_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vid_out_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vid_out_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_out(dev, &dev->vid_out_streaming); } static void vid_out_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_vid_out); } const struct vb2_ops vivid_vid_out_qops = { .queue_setup = vid_out_queue_setup, .buf_out_validate = vid_out_buf_out_validate, .buf_prepare = vid_out_buf_prepare, .buf_queue = vid_out_buf_queue, .start_streaming = vid_out_start_streaming, .stop_streaming = vid_out_stop_streaming, .buf_request_complete = vid_out_buf_request_complete, }; /* * Called whenever the format has to be reset which can occur when * changing outputs, standard, timings, etc. 
*/ void vivid_update_format_out(struct vivid_dev *dev) { struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; unsigned size, p; u64 pixelclock; switch (dev->output_type[dev->output]) { case SVID: default: dev->field_out = dev->tv_field_out; dev->sink_rect.width = 720; if (dev->std_out & V4L2_STD_525_60) { dev->sink_rect.height = 480; dev->timeperframe_vid_out = (struct v4l2_fract) { 1001, 30000 }; dev->service_set_out = V4L2_SLICED_CAPTION_525; } else { dev->sink_rect.height = 576; dev->timeperframe_vid_out = (struct v4l2_fract) { 1000, 25000 }; dev->service_set_out = V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; } dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; break; case HDMI: dev->sink_rect.width = bt->width; dev->sink_rect.height = bt->height; size = V4L2_DV_BT_FRAME_WIDTH(bt) * V4L2_DV_BT_FRAME_HEIGHT(bt); if (can_reduce_fps(bt) && (bt->flags & V4L2_DV_FL_REDUCED_FPS)) pixelclock = div_u64(bt->pixelclock * 1000, 1001); else pixelclock = bt->pixelclock; dev->timeperframe_vid_out = (struct v4l2_fract) { size / 100, (u32)pixelclock / 100 }; if (bt->interlaced) dev->field_out = V4L2_FIELD_ALTERNATE; else dev->field_out = V4L2_FIELD_NONE; if (!dev->dvi_d_out && (bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { if (bt->width == 720 && bt->height <= 576) dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; else dev->colorspace_out = V4L2_COLORSPACE_REC709; } else { dev->colorspace_out = V4L2_COLORSPACE_SRGB; } break; } dev->xfer_func_out = V4L2_XFER_FUNC_DEFAULT; dev->ycbcr_enc_out = V4L2_YCBCR_ENC_DEFAULT; dev->hsv_enc_out = V4L2_HSV_ENC_180; dev->quantization_out = V4L2_QUANTIZATION_DEFAULT; dev->compose_out = dev->sink_rect; dev->compose_bounds_out = dev->sink_rect; dev->crop_out = dev->compose_out; if (V4L2_FIELD_HAS_T_OR_B(dev->field_out)) dev->crop_out.height /= 2; dev->fmt_out_rect = dev->crop_out; for (p = 0; p < dev->fmt_out->planes; p++) dev->bytesperline_out[p] = (dev->sink_rect.width * dev->fmt_out->bit_depth[p]) / 8; } /* Map the field to something that is valid for the current output */ static enum v4l2_field vivid_field_out(struct vivid_dev *dev, enum v4l2_field field) { if (vivid_is_svid_out(dev)) { switch (field) { case V4L2_FIELD_INTERLACED_TB: case V4L2_FIELD_INTERLACED_BT: case V4L2_FIELD_SEQ_TB: case V4L2_FIELD_SEQ_BT: case V4L2_FIELD_ALTERNATE: return field; case V4L2_FIELD_INTERLACED: default: return V4L2_FIELD_INTERLACED; } } if (vivid_is_hdmi_out(dev)) return dev->dv_timings_out.bt.interlaced ? V4L2_FIELD_ALTERNATE : V4L2_FIELD_NONE; return V4L2_FIELD_NONE; } static enum tpg_pixel_aspect vivid_get_pixel_aspect(const struct vivid_dev *dev) { if (vivid_is_svid_out(dev)) return (dev->std_out & V4L2_STD_525_60) ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; if (vivid_is_hdmi_out(dev) && dev->sink_rect.width == 720 && dev->sink_rect.height <= 576) return dev->sink_rect.height == 480 ? 
TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; return TPG_PIXEL_ASPECT_SQUARE; } int vivid_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; const struct vivid_fmt *fmt = dev->fmt_out; unsigned p; mp->width = dev->fmt_out_rect.width; mp->height = dev->fmt_out_rect.height; mp->field = dev->field_out; mp->pixelformat = fmt->fourcc; mp->colorspace = dev->colorspace_out; mp->xfer_func = dev->xfer_func_out; mp->ycbcr_enc = dev->ycbcr_enc_out; mp->quantization = dev->quantization_out; mp->num_planes = fmt->buffers; for (p = 0; p < mp->num_planes; p++) { mp->plane_fmt[p].bytesperline = dev->bytesperline_out[p]; mp->plane_fmt[p].sizeimage = mp->plane_fmt[p].bytesperline * mp->height / fmt->vdownsampling[p] + fmt->data_offset[p]; } for (p = fmt->buffers; p < fmt->planes; p++) { unsigned stride = dev->bytesperline_out[p]; mp->plane_fmt[0].sizeimage += (stride * mp->height) / fmt->vdownsampling[p]; } return 0; } int vivid_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct v4l2_plane_pix_format *pfmt = mp->plane_fmt; const struct vivid_fmt *fmt; unsigned bytesperline, max_bpl; unsigned factor = 1; unsigned w, h; unsigned p; fmt = vivid_get_format(dev, mp->pixelformat); if (!fmt) { dprintk(dev, 1, "Fourcc format (0x%08x) unknown.\n", mp->pixelformat); mp->pixelformat = V4L2_PIX_FMT_YUYV; fmt = vivid_get_format(dev, mp->pixelformat); } mp->field = vivid_field_out(dev, mp->field); if (vivid_is_svid_out(dev)) { w = 720; h = (dev->std_out & V4L2_STD_525_60) ? 480 : 576; } else { w = dev->sink_rect.width; h = dev->sink_rect.height; } if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (!dev->has_scaler_out && !dev->has_crop_out && !dev->has_compose_out) { mp->width = w; mp->height = h / factor; } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height * factor }; v4l2_rect_set_min_size(&r, &vivid_min_rect); v4l2_rect_set_max_size(&r, &vivid_max_rect); if (dev->has_scaler_out && !dev->has_crop_out) { struct v4l2_rect max_r = { 0, 0, MAX_ZOOM * w, MAX_ZOOM * h }; v4l2_rect_set_max_size(&r, &max_r); } else if (!dev->has_scaler_out && dev->has_compose_out && !dev->has_crop_out) { v4l2_rect_set_max_size(&r, &dev->sink_rect); } else if (!dev->has_scaler_out && !dev->has_compose_out) { v4l2_rect_set_min_size(&r, &dev->sink_rect); } mp->width = r.width; mp->height = r.height / factor; } /* This driver supports custom bytesperline values */ mp->num_planes = fmt->buffers; for (p = 0; p < fmt->buffers; p++) { /* Calculate the minimum supported bytesperline value */ bytesperline = (mp->width * fmt->bit_depth[p]) >> 3; /* Calculate the maximum supported bytesperline value */ max_bpl = (MAX_ZOOM * MAX_WIDTH * fmt->bit_depth[p]) >> 3; if (pfmt[p].bytesperline > max_bpl) pfmt[p].bytesperline = max_bpl; if (pfmt[p].bytesperline < bytesperline) pfmt[p].bytesperline = bytesperline; pfmt[p].sizeimage = (pfmt[p].bytesperline * mp->height) / fmt->vdownsampling[p] + fmt->data_offset[p]; memset(pfmt[p].reserved, 0, sizeof(pfmt[p].reserved)); } for (p = fmt->buffers; p < fmt->planes; p++) pfmt[0].sizeimage += (pfmt[0].bytesperline * mp->height * (fmt->bit_depth[p] / fmt->vdownsampling[p])) / (fmt->bit_depth[0] / fmt->vdownsampling[0]); mp->xfer_func = V4L2_XFER_FUNC_DEFAULT; mp->ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT; mp->quantization = 
V4L2_QUANTIZATION_DEFAULT; if (vivid_is_svid_out(dev)) { mp->colorspace = V4L2_COLORSPACE_SMPTE170M; } else if (dev->dvi_d_out || !(bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { mp->colorspace = V4L2_COLORSPACE_SRGB; if (dev->dvi_d_out) mp->quantization = V4L2_QUANTIZATION_LIM_RANGE; } else if (bt->width == 720 && bt->height <= 576) { mp->colorspace = V4L2_COLORSPACE_SMPTE170M; } else if (mp->colorspace != V4L2_COLORSPACE_SMPTE170M && mp->colorspace != V4L2_COLORSPACE_REC709 && mp->colorspace != V4L2_COLORSPACE_OPRGB && mp->colorspace != V4L2_COLORSPACE_BT2020 && mp->colorspace != V4L2_COLORSPACE_SRGB) { mp->colorspace = V4L2_COLORSPACE_REC709; } memset(mp->reserved, 0, sizeof(mp->reserved)); return 0; } int vivid_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_out; struct v4l2_rect *compose = &dev->compose_out; struct vb2_queue *q = &dev->vb_vid_out_q; int ret = vivid_try_fmt_vid_out(file, priv, f); unsigned factor = 1; unsigned p; if (ret < 0) return ret; if (vb2_is_busy(q) && (vivid_is_svid_out(dev) || mp->width != dev->fmt_out_rect.width || mp->height != dev->fmt_out_rect.height || mp->pixelformat != dev->fmt_out->fourcc || mp->field != dev->field_out)) { dprintk(dev, 1, "%s device busy\n", __func__); return -EBUSY; } /* * Allow for changing the colorspace on the fly. Useful for testing * purposes, and it is something that HDMI transmitters are able * to do. */ if (vb2_is_busy(q)) goto set_colorspace; dev->fmt_out = vivid_get_format(dev, mp->pixelformat); if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (dev->has_scaler_out || dev->has_crop_out || dev->has_compose_out) { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; if (dev->has_scaler_out) { if (dev->has_crop_out) v4l2_rect_map_inside(crop, &r); else *crop = r; if (dev->has_compose_out && !dev->has_crop_out) { struct v4l2_rect min_r = { 0, 0, r.width / MAX_ZOOM, factor * r.height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, r.width * MAX_ZOOM, factor * r.height * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_r); v4l2_rect_set_max_size(compose, &max_r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } else if (dev->has_compose_out) { struct v4l2_rect min_r = { 0, 0, crop->width / MAX_ZOOM, factor * crop->height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, crop->width * MAX_ZOOM, factor * crop->height * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_r); v4l2_rect_set_max_size(compose, &max_r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } } else if (dev->has_compose_out && !dev->has_crop_out) { v4l2_rect_set_size_to(crop, &r); r.height *= factor; v4l2_rect_set_size_to(compose, &r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } else if (!dev->has_compose_out) { v4l2_rect_map_inside(crop, &r); r.height /= factor; v4l2_rect_set_size_to(compose, &r); } else { r.height *= factor; v4l2_rect_set_max_size(compose, &r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); crop->top *= factor; crop->height *= factor; v4l2_rect_set_size_to(crop, compose); v4l2_rect_map_inside(crop, &r); crop->top /= factor; crop->height /= factor; } } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; v4l2_rect_set_size_to(crop, &r); r.height /= factor; v4l2_rect_set_size_to(compose, &r); } dev->fmt_out_rect.width = mp->width; dev->fmt_out_rect.height = mp->height; for (p = 0; p < mp->num_planes; p++) dev->bytesperline_out[p] = 
mp->plane_fmt[p].bytesperline; for (p = dev->fmt_out->buffers; p < dev->fmt_out->planes; p++) dev->bytesperline_out[p] = (dev->bytesperline_out[0] * dev->fmt_out->bit_depth[p]) / dev->fmt_out->bit_depth[0]; dev->field_out = mp->field; if (vivid_is_svid_out(dev)) dev->tv_field_out = mp->field; set_colorspace: dev->colorspace_out = mp->colorspace; dev->xfer_func_out = mp->xfer_func; dev->ycbcr_enc_out = mp->ycbcr_enc; dev->quantization_out = mp->quantization; struct vivid_dev *in_dev = vivid_output_is_connected_to(dev); if (in_dev) { vivid_send_source_change(in_dev, SVID); vivid_send_source_change(in_dev, HDMI); } return 0; } int vidioc_g_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_g_fmt_vid_out(file, priv, f); } int vidioc_try_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_try_fmt_vid_out(file, priv, f); } int vidioc_s_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_s_fmt_vid_out(file, priv, f); } int vidioc_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_g_fmt_vid_out); } int vidioc_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_try_fmt_vid_out); } int vidioc_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_s_fmt_vid_out); } int vivid_vid_out_g_selection(struct file *file, void *priv, struct v4l2_selection *sel) { struct vivid_dev *dev = video_drvdata(file); if (!dev->has_crop_out && !dev->has_compose_out) return -ENOTTY; if (sel->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; sel->r.left = sel->r.top = 0; switch (sel->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_out) return -EINVAL; sel->r = dev->crop_out; break; case V4L2_SEL_TGT_CROP_DEFAULT: if (!dev->has_crop_out) return -EINVAL; sel->r = dev->fmt_out_rect; break; case V4L2_SEL_TGT_CROP_BOUNDS: if (!dev->has_crop_out) return -EINVAL; sel->r = vivid_max_rect; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_out) return -EINVAL; sel->r = dev->compose_out; break; case V4L2_SEL_TGT_COMPOSE_DEFAULT: case V4L2_SEL_TGT_COMPOSE_BOUNDS: if (!dev->has_compose_out) return -EINVAL; sel->r = dev->sink_rect; break; default: return -EINVAL; } return 0; } int vivid_vid_out_s_selection(struct file *file, void *priv, struct v4l2_selection *s) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_out; struct v4l2_rect *compose = &dev->compose_out; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_out) ? 
2 : 1; int ret; if (!dev->has_crop_out && !dev->has_compose_out) return -ENOTTY; if (s->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; switch (s->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_out) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->fmt_out_rect); if (dev->has_scaler_out) { struct v4l2_rect max_rect = { 0, 0, dev->sink_rect.width * MAX_ZOOM, (dev->sink_rect.height / factor) * MAX_ZOOM }; v4l2_rect_set_max_size(&s->r, &max_rect); if (dev->has_compose_out) { struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, (s->r.height * factor) / MAX_ZOOM }; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, (s->r.height * factor) * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_rect); v4l2_rect_set_max_size(compose, &max_rect); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } } else if (dev->has_compose_out) { s->r.top *= factor; s->r.height *= factor; v4l2_rect_set_max_size(&s->r, &dev->sink_rect); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); s->r.top /= factor; s->r.height /= factor; } else { v4l2_rect_set_size_to(&s->r, &dev->sink_rect); s->r.height /= factor; } v4l2_rect_map_inside(&s->r, &dev->fmt_out_rect); *crop = s->r; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_out) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->sink_rect); v4l2_rect_map_inside(&s->r, &dev->compose_bounds_out); s->r.top /= factor; s->r.height /= factor; if (dev->has_scaler_out) { struct v4l2_rect fmt = dev->fmt_out_rect; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, s->r.height * MAX_ZOOM }; struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, s->r.height / MAX_ZOOM }; v4l2_rect_set_min_size(&fmt, &min_rect); if (!dev->has_crop_out) v4l2_rect_set_max_size(&fmt, &max_rect); if (!v4l2_rect_same_size(&dev->fmt_out_rect, &fmt) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; if (dev->has_crop_out) { v4l2_rect_set_min_size(crop, &min_rect); v4l2_rect_set_max_size(crop, &max_rect); } dev->fmt_out_rect = fmt; } else if (dev->has_crop_out) { struct v4l2_rect fmt = dev->fmt_out_rect; v4l2_rect_set_min_size(&fmt, &s->r); if (!v4l2_rect_same_size(&dev->fmt_out_rect, &fmt) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; dev->fmt_out_rect = fmt; v4l2_rect_set_size_to(crop, &s->r); v4l2_rect_map_inside(crop, &dev->fmt_out_rect); } else { if (!v4l2_rect_same_size(&s->r, &dev->fmt_out_rect) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; v4l2_rect_set_size_to(&dev->fmt_out_rect, &s->r); v4l2_rect_set_size_to(crop, &s->r); crop->height /= factor; v4l2_rect_map_inside(crop, &dev->fmt_out_rect); } s->r.top *= factor; s->r.height *= factor; *compose = s->r; break; default: return -EINVAL; } return 0; } int vivid_vid_out_g_pixelaspect(struct file *file, void *priv, int type, struct v4l2_fract *f) { struct vivid_dev *dev = video_drvdata(file); if (type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; switch (vivid_get_pixel_aspect(dev)) { case TPG_PIXEL_ASPECT_NTSC: f->numerator = 11; f->denominator = 10; break; case TPG_PIXEL_ASPECT_PAL: f->numerator = 54; f->denominator = 59; break; default: break; } return 0; } int vidioc_g_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = 
&dev->compose_out; struct v4l2_window *win = &f->fmt.win; if (!dev->has_fb) return -EINVAL; win->w.top = dev->overlay_out_top; win->w.left = dev->overlay_out_left; win->w.width = compose->width; win->w.height = compose->height; win->field = V4L2_FIELD_ANY; win->chromakey = dev->chromakey_out; win->global_alpha = dev->global_alpha_out; return 0; } int vidioc_try_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_out; struct v4l2_window *win = &f->fmt.win; if (!dev->has_fb) return -EINVAL; win->w.left = clamp_t(int, win->w.left, -dev->display_width, dev->display_width); win->w.top = clamp_t(int, win->w.top, -dev->display_height, dev->display_height); win->w.width = compose->width; win->w.height = compose->height; /* * It makes no sense for an OSD to overlay only top or bottom fields, * so always set this to ANY. */ win->field = V4L2_FIELD_ANY; return 0; } int vidioc_s_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_window *win = &f->fmt.win; int ret = vidioc_try_fmt_vid_out_overlay(file, priv, f); if (ret) return ret; dev->overlay_out_top = win->w.top; dev->overlay_out_left = win->w.left; dev->chromakey_out = win->chromakey; dev->global_alpha_out = win->global_alpha; return ret; } int vivid_vid_out_overlay(struct file *file, void *priv, unsigned i) { struct vivid_dev *dev = video_drvdata(file); if (i && !dev->fmt_out->can_do_overlay) { dprintk(dev, 1, "unsupported output format for output overlay\n"); return -EINVAL; } dev->overlay_out_enabled = i; return 0; } int vivid_vid_out_g_fbuf(struct file *file, void *priv, struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); a->capability = V4L2_FBUF_CAP_EXTERNOVERLAY | V4L2_FBUF_CAP_CHROMAKEY | V4L2_FBUF_CAP_SRC_CHROMAKEY | V4L2_FBUF_CAP_GLOBAL_ALPHA | V4L2_FBUF_CAP_LOCAL_ALPHA | V4L2_FBUF_CAP_LOCAL_INV_ALPHA; a->flags = V4L2_FBUF_FLAG_OVERLAY | dev->fbuf_out_flags; a->base = (void *)dev->video_pbase; a->fmt.width = dev->display_width; a->fmt.height = dev->display_height; if (vivid_fb_green_bits(dev) == 5) a->fmt.pixelformat = V4L2_PIX_FMT_ARGB555; else a->fmt.pixelformat = V4L2_PIX_FMT_RGB565; a->fmt.bytesperline = dev->display_byte_stride; a->fmt.sizeimage = a->fmt.height * a->fmt.bytesperline; a->fmt.field = V4L2_FIELD_NONE; a->fmt.colorspace = V4L2_COLORSPACE_SRGB; a->fmt.priv = 0; return 0; } int vivid_vid_out_s_fbuf(struct file *file, void *priv, const struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); const unsigned chroma_flags = V4L2_FBUF_FLAG_CHROMAKEY | V4L2_FBUF_FLAG_SRC_CHROMAKEY; const unsigned alpha_flags = V4L2_FBUF_FLAG_GLOBAL_ALPHA | V4L2_FBUF_FLAG_LOCAL_ALPHA | V4L2_FBUF_FLAG_LOCAL_INV_ALPHA; if ((a->flags & chroma_flags) == chroma_flags) return -EINVAL; switch (a->flags & alpha_flags) { case 0: case V4L2_FBUF_FLAG_GLOBAL_ALPHA: case V4L2_FBUF_FLAG_LOCAL_ALPHA: case V4L2_FBUF_FLAG_LOCAL_INV_ALPHA: break; default: return -EINVAL; } dev->fbuf_out_flags &= ~(chroma_flags | alpha_flags); dev->fbuf_out_flags |= a->flags & (chroma_flags | alpha_flags); return 0; } static const struct v4l2_audioout vivid_audio_outputs[] = { { 0, "Line-Out 1" }, { 1, "Line-Out 2" }, }; int vidioc_enum_output(struct file *file, void *priv, struct v4l2_output *out) { struct vivid_dev *dev = video_drvdata(file); if (out->index >= dev->num_outputs) return -EINVAL; out->type = V4L2_OUTPUT_TYPE_ANALOG; switch 
(dev->output_type[out->index]) { case SVID: snprintf(out->name, sizeof(out->name), "S-Video %03u-%u", dev->inst, dev->output_name_counter[out->index]); out->std = V4L2_STD_ALL; if (dev->has_audio_outputs) out->audioset = (1 << ARRAY_SIZE(vivid_audio_outputs)) - 1; out->capabilities = V4L2_OUT_CAP_STD; break; case HDMI: snprintf(out->name, sizeof(out->name), "HDMI %03u-%u", dev->inst, dev->output_name_counter[out->index]); out->capabilities = V4L2_OUT_CAP_DV_TIMINGS; break; } return 0; } int vidioc_g_output(struct file *file, void *priv, unsigned *o) { struct vivid_dev *dev = video_drvdata(file); *o = dev->output; return 0; } int vidioc_s_output(struct file *file, void *priv, unsigned o) { struct vivid_dev *dev = video_drvdata(file); if (o >= dev->num_outputs) return -EINVAL; if (o == dev->output) return 0; if (vb2_is_busy(&dev->vb_vid_out_q) || vb2_is_busy(&dev->vb_vbi_out_q) || vb2_is_busy(&dev->vb_meta_out_q)) return -EBUSY; dev->output = o; dev->tv_audio_output = 0; if (dev->output_type[o] == SVID) dev->vid_out_dev.tvnorms = V4L2_STD_ALL; else dev->vid_out_dev.tvnorms = 0; dev->vbi_out_dev.tvnorms = dev->vid_out_dev.tvnorms; dev->meta_out_dev.tvnorms = dev->vid_out_dev.tvnorms; vivid_update_format_out(dev); return 0; } int vidioc_enumaudout(struct file *file, void *priv, struct v4l2_audioout *vout) { if (vout->index >= ARRAY_SIZE(vivid_audio_outputs)) return -EINVAL; *vout = vivid_audio_outputs[vout->index]; return 0; } int vidioc_g_audout(struct file *file, void *priv, struct v4l2_audioout *vout) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -EINVAL; *vout = vivid_audio_outputs[dev->tv_audio_output]; return 0; } int vidioc_s_audout(struct file *file, void *priv, const struct v4l2_audioout *vout) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -EINVAL; if (vout->index >= ARRAY_SIZE(vivid_audio_outputs)) return -EINVAL; dev->tv_audio_output = vout->index; return 0; } int vivid_vid_out_s_std(struct file *file, void *priv, v4l2_std_id id) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -ENODATA; if (dev->std_out == id) return 0; if (vb2_is_busy(&dev->vb_vid_out_q) || vb2_is_busy(&dev->vb_vbi_out_q)) return -EBUSY; dev->std_out = id; vivid_update_format_out(dev); return 0; } static bool valid_cvt_gtf_timings(struct v4l2_dv_timings *timings) { struct v4l2_bt_timings *bt = &timings->bt; if ((bt->standards & (V4L2_DV_BT_STD_CVT | V4L2_DV_BT_STD_GTF)) && v4l2_valid_dv_timings(timings, &vivid_dv_timings_cap, NULL, NULL)) return true; return false; } int vivid_vid_out_s_dv_timings(struct file *file, void *priv, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_hdmi_out(dev)) return -ENODATA; if (!v4l2_find_dv_timings_cap(timings, &vivid_dv_timings_cap, 0, NULL, NULL) && !valid_cvt_gtf_timings(timings)) return -EINVAL; if (v4l2_match_dv_timings(timings, &dev->dv_timings_out, 0, true)) return 0; if (vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; dev->dv_timings_out = *timings; vivid_update_format_out(dev); return 0; } int vivid_vid_out_g_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); if (parm->type != (dev->multiplanar ? 
V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT)) return -EINVAL; parm->parm.output.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.output.timeperframe = dev->timeperframe_vid_out; parm->parm.output.writebuffers = 1; return 0; } int vidioc_subscribe_event(struct v4l2_fh *fh, const struct v4l2_event_subscription *sub) { switch (sub->type) { case V4L2_EVENT_SOURCE_CHANGE: if (fh->vdev->vfl_dir == VFL_DIR_RX) return v4l2_src_change_event_subscribe(fh, sub); break; default: return v4l2_ctrl_subscribe_event(fh, sub); } return -EINVAL; }
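/*
 * Hedged userspace sketch (not part of vivid): driving the try/set format
 * path implemented above through V4L2.  "/dev/video1" is an assumed vivid
 * video output node; the driver clamps width, height and bytesperline just
 * as vivid_try_fmt_vid_out() does, so the values read back after
 * VIDIOC_S_FMT are the negotiated ones, not necessarily the requested ones.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/videodev2.h>

int main(void)
{
	struct v4l2_format f;
	int fd = open("/dev/video1", O_RDWR);

	if (fd < 0)
		return 1;

	memset(&f, 0, sizeof(f));
	f.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
	f.fmt.pix.width = 1280;
	f.fmt.pix.height = 720;
	f.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
	f.fmt.pix.field = V4L2_FIELD_NONE;
	f.fmt.pix.bytesperline = 0;	/* let the driver pick the minimum */

	/* TRY_FMT adjusts the request without changing device state */
	if (ioctl(fd, VIDIOC_TRY_FMT, &f)) {
		perror("VIDIOC_TRY_FMT");
		return 1;
	}

	/* S_FMT commits it (fails with EBUSY while streaming, as above) */
	if (ioctl(fd, VIDIOC_S_FMT, &f)) {
		perror("VIDIOC_S_FMT");
		return 1;
	}

	printf("negotiated %ux%u, %u bytes/line\n",
	       f.fmt.pix.width, f.fmt.pix.height, f.fmt.pix.bytesperline);
	close(fd);
	return 0;
}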
// SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); wakeup_attr(relax_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ?
ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_relax_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. 
*/ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init);
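/*
 * Illustrative userspace sketch (not part of this file): the attributes
 * registered above appear under /sys/class/wakeup/wakeup<N>/.  The instance
 * "wakeup0" is an assumption -- real code would enumerate the class
 * directory instead of hard-coding an id.
 */
#include <stdio.h>

static void dump_attr(const char *name)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/class/wakeup/wakeup0/%s", name);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%-24s %s", name, buf);
	fclose(f);
}

int main(void)
{
	/* one file per DEVICE_ATTR_RO() declared above */
	dump_attr("name");
	dump_attr("active_count");
	dump_attr("event_count");
	dump_attr("active_time_ms");
	dump_attr("total_time_ms");
	dump_attr("max_time_ms");
	dump_attr("last_change_ms");
	dump_attr("prevent_suspend_time_ms");
	return 0;
}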
// SPDX-License-Identifier: GPL-2.0-or-later
/* Keyring handling
 *
 * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/user_namespace.h>
#include <linux/nsproxy.h>
#include <keys/keyring-type.h>
#include <keys/user-type.h>
#include <linux/assoc_array_priv.h>
#include <linux/uaccess.h>
#include <net/net_namespace.h>
#include "internal.h"

/*
 * When plumbing the depths of the key tree, this sets a hard limit on how
 * deep we're willing to go.
 */
#define KEYRING_SEARCH_MAX_DEPTH 6

/*
 * We mark pointers we pass to the associative array with bit 1 set if
 * they're keyrings and clear otherwise.
 */
#define KEYRING_PTR_SUBTYPE	0x2UL

static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x)
{
	return (unsigned long)x & KEYRING_PTR_SUBTYPE;
}
static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x)
{
	void *object = assoc_array_ptr_to_leaf(x);

	return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE);
}
static inline void *keyring_key_to_ptr(struct key *key)
{
	if (key->type == &key_type_keyring)
		return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE);
	return key;
}

static DEFINE_RWLOCK(keyring_name_lock);

/*
 * Clean up the bits of user_namespace that belong to us.
 */
void key_free_user_ns(struct user_namespace *ns)
{
	write_lock(&keyring_name_lock);
	list_del_init(&ns->keyring_name_list);
	write_unlock(&keyring_name_lock);

	key_put(ns->user_keyring_register);
#ifdef CONFIG_PERSISTENT_KEYRINGS
	key_put(ns->persistent_keyring_register);
#endif
}

/*
 * The keyring key type definition.  Keyrings are simply keys of this type and
 * can be treated as ordinary keys in addition to having their own special
 * operations.
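 *
 * As an illustration (not part of the original comment; "example" is an
 * arbitrary name): userspace creates a keyring with add_key(2) and an empty
 * payload, since keyring_preparse() below rejects any data:
 *
 *	key_serial_t id = add_key("keyring", "example", NULL, 0,
 *				  KEY_SPEC_SESSION_KEYRING);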
*/ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. 
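 *
 * A worked example (illustrative, assuming the usual fan-out of 16 so that
 * the fan mask is 0xf): a non-keyring hashing to a value with a zero low
 * nibble gets bit 0 forced on, pushing it out of the keyring slot, while a
 * keyring hashing to a value with a non-zero low nibble has that nibble
 * cleared by the adjustment below.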
*/ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. 
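 *
 * For example (illustrative, assuming a 64-bit build where the hash, x,
 * type and domain_tag segments account for the first 32 bytes of the index
 * key): if two keys agree on all four segments and their descriptions first
 * differ at byte 24 with an XOR of 0x10 at that byte, the function below
 * returns bit position (32 + 24) * 8 + __ffs(0x10) = 452.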
*/ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. 
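 *
 * For instance (an illustrative line; the serial, flags and IDs are made
 * up), a session keyring holding two keys might appear in /proc/keys as:
 *
 *	3985ad4c I--Q---     4 perm 3f3f0000  1000  1000 keyring   _ses: 2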
*/ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. 
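 *
 * The contract (a summary of the code below, with "revoked" standing in for
 * the real flag tests): return 0 to keep walking, 1 once ctx->result holds
 * a definite answer, and ctx->skipped_ret when this key is rejected but the
 * walk may continue:
 *
 *	if (revoked) {
 *		ctx->result = ERR_PTR(-EKEYREVOKED);
 *		return ctx->skipped_ret;
 *	}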
*/ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. 
*/ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { if (node->back_pointer || assoc_array_ptr_is_shortcut(ptr)) goto descend_to_node; } if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. 
*/ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. 
* @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. 
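 *
 * For example (illustrative; the options belong to the named key type, here
 * the "asymmetric" type's "key_or_keyring" checker, with 123 standing in
 * for a real key serial):
 *
 *	ret = keyring_restrict(keyring_ref, "asymmetric", "key_or_keyring:123");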
*/ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. 
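 *
 * This lookup is what backs joining a named session keyring, e.g. from
 * userspace via the keyutils library (the name is arbitrary):
 *
 *	key_serial_t serial = keyctl_join_session_keyring("example-session");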
*/ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. 
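 *
 * Concretely: if one task moves a key from keyring A to keyring B while
 * another moves a key from B to A, both take the lower-addressed keyring's
 * semaphore first, so neither can hold one semaphore while waiting for the
 * other.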
*/ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. 
* * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). 
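 *
 * From userspace this corresponds to the KEYCTL_UNLINK operation
 * (illustrative; key and ring are hypothetical serial numbers):
 *
 *	keyctl(KEYCTL_UNLINK, key, ring);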
*/ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyri |