| 14 14 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 | // SPDX-License-Identifier: GPL-2.0-only /* * scsi.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * Copyright (C) 2002, 2003 Christoph Hellwig * * generic mid-level SCSI driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * * <drew@colorado.edu> * * Bug correction thanks go to : * Rik Faith <faith@cs.unc.edu> * Tommy Thorn <tthorn> * Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de> * * Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to * add scatter-gather, multiple outstanding request, and other * enhancements. * * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer <mike@i-connect.net> * * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf) * Bjorn Ekwall <bj0rn@blox.se> * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. Zubkoff <lnz@dandelion.com> * * Converted cli() code to spinlocks, Ingo Molnar * * Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli * * out_of_space hacks, D. Gilbert (dpg) 990608 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/unistd.h> #include <linux/spinlock.h> #include <linux/kmod.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include "scsi_priv.h" #include "scsi_logging.h" #define CREATE_TRACE_POINTS #include <trace/events/scsi.h> /* * Definitions and constants. */ /* * Note - the initial logging level can be set here to log events at boot time. * After the system is up, you may enable logging via the /proc interface. */ unsigned int scsi_logging_level; #if defined(CONFIG_SCSI_LOGGING) EXPORT_SYMBOL(scsi_logging_level); #endif #ifdef CONFIG_SCSI_LOGGING void scsi_log_send(struct scsi_cmnd *cmd) { unsigned int level; /* * If ML QUEUE log level is greater than or equal to: * * 1: nothing (match completion) * * 2: log opcode + command of all commands + cmd address * * 3: same as 2 * * 4: same as 3 */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT, SCSI_LOG_MLQUEUE_BITS); if (level > 1) { scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd); scsi_print_command(cmd); } } } void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) { unsigned int level; /* * If ML COMPLETE log level is greater than or equal to: * * 1: log disposition, result, opcode + command, and conditionally * sense data for failures or non SUCCESS dispositions. * * 2: same as 1 but for all command completions. * * 3: same as 2 * * 4: same as 3 plus dump extra junk */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); if (((level > 0) && (cmd->result || disposition != SUCCESS)) || (level > 1)) { scsi_print_result(cmd, "Done", disposition); scsi_print_command(cmd); if (scsi_status_is_check_condition(cmd->result)) scsi_print_sense(cmd); if (level > 3) scmd_printk(KERN_INFO, cmd, "scsi host busy %d failed %d\n", scsi_host_busy(cmd->device->host), cmd->device->host->host_failed); } } } #endif /** * scsi_finish_command - cleanup and pass command back to upper layer * @cmd: the command * * Description: Pass command off to upper layer for finishing of I/O * request, waking processes that are waiting on results, * etc. */ void scsi_finish_command(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; struct scsi_driver *drv; unsigned int good_bytes; scsi_device_unbusy(sdev, cmd); /* * Clear the flags that say that the device/target/host is no longer * capable of accepting new commands. */ if (atomic_read(&shost->host_blocked)) atomic_set(&shost->host_blocked, 0); if (atomic_read(&starget->target_blocked)) atomic_set(&starget->target_blocked, 0); if (atomic_read(&sdev->device_blocked)) atomic_set(&sdev->device_blocked, 0); SCSI_LOG_MLCOMPLETE(4, sdev_printk(KERN_INFO, sdev, "Notifying upper driver of completion " "(result %x)\n", cmd->result)); good_bytes = scsi_bufflen(cmd); if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) { int old_good_bytes = good_bytes; drv = scsi_cmd_to_driver(cmd); if (drv->done) good_bytes = drv->done(cmd); /* * USB may not give sense identifying bad sector and * simply return a residue instead, so subtract off the * residue if drv->done() error processing indicates no * change to the completion length. */ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); } scsi_io_completion(cmd, good_bytes); } /* * 4096 is big enough for saturating fast SCSI LUNs. */ int scsi_device_max_queue_depth(struct scsi_device *sdev) { return min_t(int, sdev->host->can_queue, 4096); } /** * scsi_change_queue_depth - change a device's queue depth * @sdev: SCSI Device in question * @depth: number of commands allowed to be queued to the driver * * Sets the device queue depth and returns the new value. */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { sdev->queue_depth = depth; wmb(); } if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); /** * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth * @sdev: SCSI Device in question * @depth: Current number of outstanding SCSI commands on this device, * not counting the one returned as QUEUE_FULL. * * Description: This function will track successive QUEUE_FULL events on a * specific SCSI device to determine if and when there is a * need to adjust the queue depth on the device. * * Returns: 0 - No change needed, >0 - Adjust queue depth to this new depth, * -1 - Drop back to untagged operation using host->cmd_per_lun * as the untagged command depth * * Lock Status: None held on entry * * Notes: Low level drivers may call this at any time and we will do * "The Right Thing." We are interrupt context safe. */ int scsi_track_queue_full(struct scsi_device *sdev, int depth) { /* * Don't let QUEUE_FULLs on the same * jiffies count, they could all be from * same event. */ if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4)) return 0; sdev->last_queue_full_time = jiffies; if (sdev->last_queue_full_depth != depth) { sdev->last_queue_full_count = 1; sdev->last_queue_full_depth = depth; } else { sdev->last_queue_full_count++; } if (sdev->last_queue_full_count <= 10) return 0; return scsi_change_queue_depth(sdev, depth); } EXPORT_SYMBOL(scsi_track_queue_full); /** * scsi_vpd_inquiry - Request a device provide us with a VPD page * @sdev: The device to ask * @buffer: Where to put the result * @page: Which Vital Product Data to return * @len: The length of the buffer * * This is an internal helper function. You probably want to use * scsi_get_vpd_page instead. * * Returns size of the vpd page on success or a negative error number. */ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, u8 page, unsigned len) { int result; unsigned char cmd[16]; if (len < 4) return -EINVAL; cmd[0] = INQUIRY; cmd[1] = 1; /* EVPD */ cmd[2] = page; cmd[3] = len >> 8; cmd[4] = len & 0xff; cmd[5] = 0; /* Control byte */ /* * I'm not convinced we need to try quite this hard to get VPD, but * all the existing users tried this hard. */ result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len, 30 * HZ, 3, NULL); if (result) return -EIO; /* * Sanity check that we got the page back that we asked for and that * the page size is not 0. */ if (buffer[1] != page) return -EIO; result = get_unaligned_be16(&buffer[2]); if (!result) return -EIO; return result + 4; } enum scsi_vpd_parameters { SCSI_VPD_HEADER_SIZE = 4, SCSI_VPD_LIST_SIZE = 36, }; static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) { unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4); int result; if (sdev->no_vpd_size) return SCSI_DEFAULT_VPD_LEN; /* * Fetch the supported pages VPD and validate that the requested page * number is present. */ if (page != 0) { result = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd)); if (result < SCSI_VPD_HEADER_SIZE) return 0; if (result > sizeof(vpd)) { dev_warn_once(&sdev->sdev_gendev, "%s: long VPD page 0 length: %d bytes\n", __func__, result); result = sizeof(vpd); } result -= SCSI_VPD_HEADER_SIZE; if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) return 0; } /* * Fetch the VPD page header to find out how big the page * is. This is done to prevent problems on legacy devices * which can not handle allocation lengths as large as * potentially requested by the caller. */ result = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE); if (result < 0) return 0; if (result < SCSI_VPD_HEADER_SIZE) { dev_warn_once(&sdev->sdev_gendev, "%s: short VPD page 0x%02x length: %d bytes\n", __func__, page, result); return 0; } return result; } /** * scsi_get_vpd_page - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * @buf: where to store the VPD * @buf_len: number of bytes in the VPD buffer area * * SCSI devices may optionally supply Vital Product Data. Each 'page' * of VPD is defined in the appropriate SCSI document (eg SPC, SBC). * If the device supports this VPD page, this routine fills @buf * with the data from that page and return 0. If the VPD page is not * supported or its content cannot be retrieved, -EINVAL is returned. */ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, int buf_len) { int result, vpd_len; if (!scsi_device_supports_vpd(sdev)) return -EINVAL; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return -EINVAL; vpd_len = min(vpd_len, buf_len); /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ memset(buf, 0, buf_len); result = scsi_vpd_inquiry(sdev, buf, page, vpd_len); if (result < 0) return -EINVAL; else if (result > vpd_len) dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); return 0; } EXPORT_SYMBOL_GPL(scsi_get_vpd_page); /** * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * * Returns %NULL upon failure. */ static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { struct scsi_vpd *vpd_buf; int vpd_len, result; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return NULL; retry_pg: /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; } if (result > vpd_len) { dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); vpd_len = result; kfree(vpd_buf); goto retry_pg; } vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, struct scsi_vpd __rcu **sdev_vpd_buf) { struct scsi_vpd *vpd_buf; vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) kfree_rcu(vpd_buf, rcu); } /** * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure * @sdev: The device to ask * * Attach the 'Device Identification' VPD page (0x83) and the * 'Unit Serial Number' VPD page (0x80) to a SCSI device * structure. This information can be used to identify the device * uniquely. */ void scsi_attach_vpd(struct scsi_device *sdev) { int i; struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; for (i = 4; i < vpd_buf->len; i++) { switch (vpd_buf->data[i]) { case 0x0: scsi_update_vpd_page(sdev, 0x0, &sdev->vpd_pg0); break; case 0x80: scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); break; case 0x83: scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); break; case 0x89: scsi_update_vpd_page(sdev, 0x89, &sdev->vpd_pg89); break; case 0xb0: scsi_update_vpd_page(sdev, 0xb0, &sdev->vpd_pgb0); break; case 0xb1: scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1); break; case 0xb2: scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2); break; case 0xb7: scsi_update_vpd_page(sdev, 0xb7, &sdev->vpd_pgb7); break; default: break; } } kfree(vpd_buf); } /** * scsi_report_opcode - Find out if a given command is supported * @sdev: scsi device to query * @buffer: scratch buffer (must be at least 20 bytes long) * @len: length of buffer * @opcode: opcode for the command to look up * @sa: service action for the command to look up * * Uses the REPORT SUPPORTED OPERATION CODES to check support for the * command identified with @opcode and @sa. If the command does not * have a service action, @sa must be 0. Returns -EINVAL if RSOC fails, * 0 if the command is not supported and 1 if the device claims to * support the command. */ int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int result, request_len; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3) return -EINVAL; /* RSOC header + size of command we are asking about */ request_len = 4 + COMMAND_SIZE(opcode); if (request_len > len) { dev_warn_once(&sdev->sdev_gendev, "%s: len %u bytes, opcode 0x%02x needs %u\n", __func__, len, opcode, request_len); return -EINVAL; } memset(cmd, 0, 16); cmd[0] = MAINTENANCE_IN; cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES; if (!sa) { cmd[2] = 1; /* One command format */ cmd[3] = opcode; } else { cmd[2] = 3; /* One command format with service action */ cmd[3] = opcode; put_unaligned_be16(sa, &cmd[4]); } put_unaligned_be32(request_len, &cmd[6]); memset(buffer, 0, len); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, request_len, 30 * HZ, 3, &exec_args); if (result < 0) return result; if (result && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) return -EINVAL; if ((buffer[1] & 3) == 3) /* Command supported */ return 1; return 0; } EXPORT_SYMBOL(scsi_report_opcode); #define SCSI_CDL_CHECK_BUF_LEN 64 static bool scsi_cdl_check_cmd(struct scsi_device *sdev, u8 opcode, u16 sa, unsigned char *buf) { int ret; u8 cdlp; /* Check operation code */ ret = scsi_report_opcode(sdev, buf, SCSI_CDL_CHECK_BUF_LEN, opcode, sa); if (ret <= 0) return false; if ((buf[1] & 0x03) != 0x03) return false; /* * See SPC-6, One_command parameter data format for * REPORT SUPPORTED OPERATION CODES. We have the following cases * depending on rwcdlp (buf[0] & 0x01) value: * - rwcdlp == 0: then cdlp indicates support for the A mode page when * it is equal to 1 and for the B mode page when it is * equal to 2. * - rwcdlp == 1: then cdlp indicates support for the T2A mode page * when it is equal to 1 and for the T2B mode page when * it is equal to 2. * Overall, to detect support for command duration limits, we only need * to check that cdlp is 1 or 2. */ cdlp = (buf[1] & 0x18) >> 3; return cdlp == 0x01 || cdlp == 0x02; } /** * scsi_cdl_check - Check if a SCSI device supports Command Duration Limits * @sdev: The device to check */ void scsi_cdl_check(struct scsi_device *sdev) { bool cdl_supported; unsigned char *buf; /* * Support for CDL was defined in SPC-5. Ignore devices reporting an * lower SPC version. This also avoids problems with old drives choking * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a * service action specified, as done in scsi_cdl_check_cmd(). */ if (sdev->scsi_level < SCSI_SPC_5) { sdev->cdl_supported = 0; return; } buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL); if (!buf) { sdev->cdl_supported = 0; return; } /* Check support for READ_16, WRITE_16, READ_32 and WRITE_32 commands */ cdl_supported = scsi_cdl_check_cmd(sdev, READ_16, 0, buf) || scsi_cdl_check_cmd(sdev, WRITE_16, 0, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, READ_32, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, WRITE_32, buf); if (cdl_supported) { /* * We have CDL support: force the use of READ16/WRITE16. * READ32 and WRITE32 will be used for devices that support * the T10_PI_TYPE2_PROTECTION protection type. */ sdev->use_16_for_rw = 1; sdev->use_10_for_rw = 0; sdev->cdl_supported = 1; /* * If the device supports CDL, make sure that the current drive * feature status is consistent with the user controlled * cdl_enable state. */ scsi_cdl_enable(sdev, sdev->cdl_enable); } else { sdev->cdl_supported = 0; } kfree(buf); } /** * scsi_cdl_enable - Enable or disable a SCSI device supports for Command * Duration Limits * @sdev: The target device * @enable: the target state */ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) { char buf[64]; bool is_ata; int ret; if (!sdev->cdl_supported) return -EOPNOTSUPP; rcu_read_lock(); is_ata = rcu_dereference(sdev->vpd_pg89); rcu_read_unlock(); /* * For ATA devices, CDL needs to be enabled with a SET FEATURES command. */ if (is_ata) { struct scsi_mode_data data; struct scsi_sense_hdr sshdr; char *buf_data; int len; ret = scsi_mode_sense(sdev, 0x08, 0x0a, 0xf2, buf, sizeof(buf), 5 * HZ, 3, &data, NULL); if (ret) return -EINVAL; /* Enable or disable CDL using the ATA feature page */ len = min_t(size_t, sizeof(buf), data.length - data.header_length - data.block_descriptor_length); buf_data = buf + data.header_length + data.block_descriptor_length; /* * If we want to enable CDL and CDL is already enabled on the * device, do nothing. This avoids needlessly resetting the CDL * statistics on the device as that is implied by the CDL enable * action. Similar to this, there is no need to do anything if * we want to disable CDL and CDL is already disabled. */ if (enable) { if ((buf_data[4] & 0x03) == 0x02) goto out; buf_data[4] &= ~0x03; buf_data[4] |= 0x02; } else { if ((buf_data[4] & 0x03) == 0x00) goto out; buf_data[4] &= ~0x03; } ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) scsi_print_sense_hdr(sdev, dev_name(&sdev->sdev_gendev), &sshdr); return ret; } } out: sdev->cdl_enable = enable; return 0; } /** * scsi_device_get - get an additional reference to a scsi_device * @sdev: device to get a reference to * * Description: Gets a reference to the scsi_device and increments the use count * of the underlying LLDD module. You must hold host_lock of the * parent Scsi_Host or already have a reference when calling this. * * This will fail if a device is deleted or cancelled, or when the LLD module * is in the process of being unloaded. */ int scsi_device_get(struct scsi_device *sdev) { if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) goto fail; if (!try_module_get(sdev->host->hostt->module)) goto fail; if (!get_device(&sdev->sdev_gendev)) goto fail_put_module; return 0; fail_put_module: module_put(sdev->host->hostt->module); fail: return -ENXIO; } EXPORT_SYMBOL(scsi_device_get); /** * scsi_device_put - release a reference to a scsi_device * @sdev: device to release a reference on. * * Description: Release a reference to the scsi_device and decrements the use * count of the underlying LLDD module. The device is freed once the last * user vanishes. */ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; put_device(&sdev->sdev_gendev); module_put(mod); } EXPORT_SYMBOL(scsi_device_put); /* helper for shost_for_each_device, see that for documentation */ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost, struct scsi_device *prev) { struct list_head *list = (prev ? &prev->siblings : &shost->__devices); struct scsi_device *next = NULL; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); while (list->next != &shost->__devices) { next = list_entry(list->next, struct scsi_device, siblings); /* skip devices that we can't get a reference to */ if (!scsi_device_get(next)) break; next = NULL; list = list->next; } spin_unlock_irqrestore(shost->host_lock, flags); if (prev) scsi_device_put(prev); return next; } EXPORT_SYMBOL(__scsi_iterate_devices); /** * starget_for_each_device - helper to walk all devices of a target * @starget: target whose devices we want to iterate over. * @data: Opaque passed to each function call. * @fn: Function to call on each device * * This traverses over each device of @starget. The devices have * a reference that must be released by scsi_host_put when breaking * out of the loop. */ void starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(starget_for_each_device); /** * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED) * @starget: target whose devices we want to iterate over. * @data: parameter for callback @fn() * @fn: callback function that is invoked for each device * * This traverses over each device of @starget. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use starget_for_each_device instead. **/ void __starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(__starget_for_each_device); /** * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED) * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and * any access to the returned scsi_device. A scsi_device in state * SDEV_DEL is skipped. * * Note: The only reason why drivers should use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup_by_target instead. **/ struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &starget->devices, same_target_siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup_by_target); /** * scsi_device_lookup_by_target - find a device given the target * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup_by_target(starget, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup_by_target); /** * __scsi_device_lookup - find a device given the host (UNLOCKED) * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and any access * to the returned scsi_device. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup instead. **/ struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->channel == channel && sdev->id == id && sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup); /** * scsi_device_lookup - find a device given the host * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup(shost, channel, id, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup); MODULE_DESCRIPTION("SCSI core"); MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); static int __init init_scsi(void) { int error; error = scsi_init_procfs(); if (error) goto cleanup_queue; error = scsi_init_devinfo(); if (error) goto cleanup_procfs; error = scsi_init_hosts(); if (error) goto cleanup_devlist; error = scsi_init_sysctl(); if (error) goto cleanup_hosts; error = scsi_sysfs_register(); if (error) goto cleanup_sysctl; scsi_netlink_init(); printk(KERN_NOTICE "SCSI subsystem initialized\n"); return 0; cleanup_sysctl: scsi_exit_sysctl(); cleanup_hosts: scsi_exit_hosts(); cleanup_devlist: scsi_exit_devinfo(); cleanup_procfs: scsi_exit_procfs(); cleanup_queue: scsi_exit_queue(); printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n", -error); return error; } static void __exit exit_scsi(void) { scsi_netlink_exit(); scsi_sysfs_unregister(); scsi_exit_sysctl(); scsi_exit_hosts(); scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); } subsys_initcall(init_scsi); module_exit(exit_scsi); |
| 5294 5294 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_THROTTLE_H #define BLK_THROTTLE_H #include "blk-cgroup-rwstat.h" /* * To implement hierarchical throttling, throtl_grps form a tree and bios * are dispatched upwards level by level until they reach the top and get * issued. When dispatching bios from the children and local group at each * level, if the bios are dispatched into a single bio_list, there's a risk * of a local or child group which can queue many bios at once filling up * the list starving others. * * To avoid such starvation, dispatched bios are queued separately * according to where they came from. When they are again dispatched to * the parent, they're popped in round-robin order so that no single source * hogs the dispatch window. * * throtl_qnode is used to keep the queued bios separated by their sources. * Bios are queued to throtl_qnode which in turn is queued to * throtl_service_queue and then dispatched in round-robin order. * * It's also used to track the reference counts on blkg's. A qnode always * belongs to a throtl_grp and gets queued on itself or the parent, so * incrementing the reference of the associated throtl_grp when a qnode is * queued and decrementing when dequeued is enough to keep the whole blkg * tree pinned while bios are in flight. */ struct throtl_qnode { struct list_head node; /* service_queue->queued[] */ struct bio_list bios_bps; /* queued bios for bps limit */ struct bio_list bios_iops; /* queued bios for iops limit */ struct throtl_grp *tg; /* tg this qnode belongs to */ }; struct throtl_service_queue { struct throtl_service_queue *parent_sq; /* the parent service_queue */ /* * Bios queued directly to this service_queue or dispatched from * children throtl_grp's. */ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ unsigned int nr_queued_bps[2]; /* number of queued bps bios */ unsigned int nr_queued_iops[2]; /* number of queued iops bios */ /* * RB tree of active children throtl_grp's, which are sorted by * their ->disptime. */ struct rb_root_cached pending_tree; /* RB tree of active tgs */ unsigned int nr_pending; /* # queued in the tree */ unsigned long first_pending_disptime; /* disptime of the first tg */ struct timer_list pending_timer; /* fires on first_pending_disptime */ }; enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ /* * The sq's iops queue is empty, and a bio is about to be enqueued * to the first qnode's bios_iops list. */ THROTL_TG_IOPS_WAS_EMPTY = 1 << 2, THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; /* active throtl group service_queue member */ struct rb_node rb_node; /* throtl_data this group belongs to */ struct throtl_data *td; /* this group's service queue */ struct throtl_service_queue service_queue; /* * qnode_on_self is used when bios are directly queued to this * throtl_grp so that local bios compete fairly with bios * dispatched from children. qnode_on_parent is used when bios are * dispatched from this throtl_grp into its parent and will compete * with the sibling qnode_on_parents and the parent's * qnode_on_self. */ struct throtl_qnode qnode_on_self[2]; struct throtl_qnode qnode_on_parent[2]; /* * Dispatch time in jiffies. This is the estimated time when group * will unthrottle and is ready to dispatch more bio. It is used as * key to sort active groups in service tree. */ unsigned long disptime; unsigned int flags; /* are there any throtl rules between this group and td? */ bool has_rules_bps[2]; bool has_rules_iops[2]; /* bytes per second rate limits */ uint64_t bps[2]; /* IOPS limits */ unsigned int iops[2]; /* * Number of bytes/bio's dispatched in current slice. * When new configuration is submitted while some bios are still throttled, * first calculate the carryover: the amount of bytes/IOs already waited * under the previous configuration. Then, [bytes/io]_disp are represented * as the negative of the carryover, and they will be used to calculate the * wait time under the new configuration. */ int64_t bytes_disp[2]; int io_disp[2]; unsigned long last_check_time; /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; extern struct blkcg_policy blkcg_policy_throtl; static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct throtl_grp, pd) : NULL; } static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) { return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); } /* * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING static inline void blk_throtl_exit(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ void blk_throtl_exit(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_throtl_activated(struct request_queue *q) { return q->td != NULL; } static inline bool blk_should_throtl(struct bio *bio) { struct throtl_grp *tg; int rw = bio_data_dir(bio); /* * This is called under bio_queue_enter(), and it's synchronized with * the activation of blk-throtl, which is protected by * blk_mq_freeze_queue(). */ if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) return false; tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); } blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); } /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true; if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) return true; return false; } static inline bool blk_throtl_bio(struct bio *bio) { if (!blk_should_throtl(bio)) return false; return __blk_throtl_bio(bio); } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif |
| 34 30 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_IO_READ_H #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" #include "btree_iter.h" #include "extents_types.h" #include "reflink.h" struct bch_read_bio { struct bch_fs *c; u64 start_time; u64 submit_time; /* * Reads will often have to be split, and if the extent being read from * was checksummed or compressed we'll also have to allocate bounce * buffers and copy the data back into the original bio. * * If we didn't have to split, we have to save and restore the original * bi_end_io - @split below indicates which: */ union { struct bch_read_bio *parent; bio_end_io_t *end_io; }; /* * Saved copy of bio->bi_iter, from submission time - allows us to * resubmit on IO error, and also to copy data back to the original bio * when we're bouncing: */ struct bvec_iter bvec_iter; unsigned offset_into_extent; u16 flags; union { struct { u16 data_update:1, promote:1, bounce:1, split:1, have_ioref:1, narrow_crcs:1, saw_error:1, self_healing:1, context:2; }; u16 _state; }; s16 ret; #ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS unsigned list_idx; #endif struct extent_ptr_decoded pick; /* * pos we read from - different from data_pos for indirect extents: */ u32 subvol; struct bpos read_pos; /* * start pos of data we read (may not be pos of data we want) - for * promote, narrow extents paths: */ enum btree_id data_btree; struct bpos data_pos; struct bversion version; struct bch_io_opts opts; struct work_struct work; struct bio bio; }; #define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; static inline int bch2_read_indirect_extent(struct btree_trans *trans, enum btree_id *data_btree, s64 *offset_into_extent, struct bkey_buf *extent) { if (extent->k->k.type != KEY_TYPE_reflink_p) return 0; *data_btree = BTREE_ID_reflink; struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, offset_into_extent, bkey_i_to_s_c_reflink_p(extent->k), true, 0); int ret = bkey_err(k); if (ret) return ret; if (bkey_deleted(k.k)) { bch2_trans_iter_exit(trans, &iter); return bch_err_throw(c, missing_indirect_extent); } bch2_bkey_buf_reassemble(extent, c, k); bch2_trans_iter_exit(trans, &iter); return 0; } #define BCH_READ_FLAGS() \ x(retry_if_stale) \ x(may_promote) \ x(user_mapped) \ x(last_fragment) \ x(must_bounce) \ x(must_clone) \ x(in_retry) enum __bch_read_flags { #define x(n) __BCH_READ_##n, BCH_READ_FLAGS() #undef x }; enum bch_read_flags { #define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), BCH_READ_FLAGS() #undef x }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, struct bch_io_failures *, unsigned, int); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, data_btree, k, offset_into_extent, NULL, flags, -1); /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ WARN(ret, "unhandled error from __bch2_read_extent()"); } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, subvol_inum, struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { BUG_ON(rbio->_state); rbio->subvol = inum.subvol; bch2_trans_run(c, __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped)); } static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, struct bch_read_bio *orig) { struct bch_read_bio *rbio = to_rbio(bio); rbio->c = orig->c; rbio->_state = 0; rbio->flags = 0; rbio->ret = 0; rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; #ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS rbio->list_idx = 0; #endif return rbio; } static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_fs *c, struct bch_io_opts opts, bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; rbio->flags = 0; rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; #ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS rbio->list_idx = 0; #endif return rbio; } struct promote_op; void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); void bch2_fs_io_read_exit(struct bch_fs *); int bch2_fs_io_read_init(struct bch_fs *); #endif /* _BCACHEFS_IO_READ_H */ |
| 76 75 26 8 18 18 61 62 1 2 68 68 1 68 68 58 6 4 54 3 64 1 47 2 4 7 1 11 3 2 70 1 1 68 14 6 66 7 1 18 2 24 1 52 75 1 75 75 5 14 56 75 2 3 55 17 20 52 76 4 1 1 52 2 68 4 21 73 5 61 1 4 60 60 60 60 63 60 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 | // SPDX-License-Identifier: GPL-2.0+ /* * the_nilfs shared structure. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. * */ #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/log2.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "alloc.h" #include "cpfile.h" #include "sufile.h" #include "dat.h" #include "segbuf.h" static int nilfs_valid_sb(struct nilfs_super_block *sbp); void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { spin_lock(&nilfs->ns_last_segment_lock); nilfs->ns_last_pseg = start_blocknr; nilfs->ns_last_seq = seq; nilfs->ns_last_cno = cno; if (!nilfs_sb_dirty(nilfs)) { if (nilfs->ns_prev_seq == nilfs->ns_last_seq) goto stay_cursor; set_nilfs_sb_dirty(nilfs); } nilfs->ns_prev_seq = nilfs->ns_last_seq; stay_cursor: spin_unlock(&nilfs->ns_last_segment_lock); } /** * alloc_nilfs - allocate a nilfs object * @sb: super block instance * * Return: a pointer to the allocated nilfs object on success, or NULL on * failure. */ struct the_nilfs *alloc_nilfs(struct super_block *sb) { struct the_nilfs *nilfs; nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); if (!nilfs) return NULL; nilfs->ns_sb = sb; nilfs->ns_bdev = sb->s_bdev; atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); mutex_init(&nilfs->ns_snapshot_mount_mutex); INIT_LIST_HEAD(&nilfs->ns_dirty_files); INIT_LIST_HEAD(&nilfs->ns_gc_inodes); spin_lock_init(&nilfs->ns_inode_lock); spin_lock_init(&nilfs->ns_last_segment_lock); nilfs->ns_cptree = RB_ROOT; spin_lock_init(&nilfs->ns_cptree_lock); init_rwsem(&nilfs->ns_segctor_sem); nilfs->ns_sb_update_freq = NILFS_SB_FREQ; return nilfs; } /** * destroy_nilfs - destroy nilfs object * @nilfs: nilfs object to be released */ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } kfree(nilfs); } static int nilfs_load_super_root(struct the_nilfs *nilfs, struct super_block *sb, sector_t sr_block) { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; struct nilfs_inode *rawi; unsigned int dat_entry_size, segment_usage_size, checkpoint_size; unsigned int inode_size; int err; err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); if (unlikely(err)) return err; down_read(&nilfs->ns_sem); dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); up_read(&nilfs->ns_sem); inode_size = nilfs->ns_inode_size; rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size); err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat); if (err) goto failed; rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size); err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile); if (err) goto failed_dat; rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size); err = nilfs_sufile_read(sb, segment_usage_size, rawi, &nilfs->ns_sufile); if (err) goto failed_cpfile; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); failed: brelse(bh_sr); return err; failed_cpfile: iput(nilfs->ns_cpfile); failed_dat: iput(nilfs->ns_dat); goto failed; } static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) { memset(ri, 0, sizeof(*ri)); INIT_LIST_HEAD(&ri->ri_used_segments); } static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) { nilfs_dispose_segment_list(&ri->ri_used_segments); } /** * nilfs_store_log_cursor - load log cursor from a super block * @nilfs: nilfs object * @sbp: buffer storing super block to be read * * nilfs_store_log_cursor() reads the last position of the log * containing a super root from a given super block, and initializes * relevant information on the nilfs object preparatory for log * scanning and recovery. * * Return: 0 on success, or %-EINVAL if current segment number is out * of range. */ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { int ret = 0; nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); nilfs->ns_prev_seq = nilfs->ns_last_seq; nilfs->ns_seg_seq = nilfs->ns_last_seq; nilfs->ns_segnum = nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); nilfs->ns_cno = nilfs->ns_last_cno + 1; if (nilfs->ns_segnum >= nilfs->ns_nsegments) { nilfs_err(nilfs->ns_sb, "pointed segment number is out of range: segnum=%llu, nsegments=%lu", (unsigned long long)nilfs->ns_segnum, nilfs->ns_nsegments); ret = -EINVAL; } return ret; } /** * nilfs_get_blocksize - get block size from raw superblock data * @sb: super block instance * @sbp: superblock raw data buffer * @blocksize: place to store block size * * nilfs_get_blocksize() calculates the block size from the block size * exponent information written in @sbp and stores it in @blocksize, * or aborts with an error message if it's too large. * * Return: 0 on success, or %-EINVAL if the block size is too large. */ static int nilfs_get_blocksize(struct super_block *sb, struct nilfs_super_block *sbp, int *blocksize) { unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); if (unlikely(shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", shift_bits); return -EINVAL; } *blocksize = BLOCK_SIZE << shift_bits; return 0; } /** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released * @sb: super block instance used to recover past segment * * load_nilfs() searches and load the latest super root, * attaches the last segment, and does recovery if needed. * The caller must call this exclusively for simultaneous mounts. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - No valid segment found. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. * * %-EROFS - Read only device or RO compat mode (if recovery is required) */ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { struct nilfs_recovery_info ri; unsigned int s_flags = sb->s_flags; int really_read_only = bdev_read_only(nilfs->ns_bdev); int valid_fs = nilfs_valid_fs(nilfs); int err; if (!valid_fs) { nilfs_warn(sb, "mounting unchecked fs"); if (s_flags & SB_RDONLY) { nilfs_info(sb, "recovery required for readonly filesystem"); nilfs_info(sb, "write access will be enabled during recovery"); } } nilfs_init_recovery_info(&ri); err = nilfs_search_super_root(nilfs, &ri); if (unlikely(err)) { struct nilfs_super_block **sbp = nilfs->ns_sbp; int blocksize; if (err != -EINVAL) goto scan_error; if (!nilfs_valid_sb(sbp[1])) { nilfs_warn(sb, "unable to fall back to spare super block"); goto scan_error; } nilfs_info(sb, "trying rollback from an earlier position"); /* * restore super block with its spare and reconfigure * relevant states of the nilfs object. */ memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ err = nilfs_get_blocksize(sb, sbp[0], &blocksize); if (err) goto scan_error; if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); err = -EINVAL; goto scan_error; } err = nilfs_store_log_cursor(nilfs, sbp[0]); if (err) goto scan_error; /* drop clean flag to allow roll-forward and recovery */ nilfs->ns_mount_state &= ~NILFS_VALID_FS; valid_fs = 0; err = nilfs_search_super_root(nilfs, &ri); if (err) goto scan_error; } err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); if (unlikely(err)) { nilfs_err(sb, "error %d while loading super root", err); goto failed; } err = nilfs_sysfs_create_device_group(sb); if (unlikely(err)) goto sysfs_error; if (valid_fs) goto skip_recovery; if (s_flags & SB_RDONLY) { __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_info(sb, "norecovery option specified, skipping roll-forward recovery"); goto skip_recovery; } features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (features) { nilfs_err(sb, "couldn't proceed with recovery because of unsupported optional features (%llx)", (unsigned long long)features); err = -EROFS; goto failed_unload; } if (really_read_only) { nilfs_err(sb, "write access unavailable, cannot proceed"); err = -EROFS; goto failed_unload; } sb->s_flags &= ~SB_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_err(sb, "recovery cancelled because norecovery option was specified for a read/write mount"); err = -EINVAL; goto failed_unload; } err = nilfs_salvage_orphan_logs(nilfs, sb, &ri); if (err) goto failed_unload; down_write(&nilfs->ns_sem); nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ err = nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); if (err) { nilfs_err(sb, "error %d updating super block. recovery unfinished.", err); goto failed_unload; } nilfs_info(sb, "recovery complete"); skip_recovery: nilfs_clear_recovery_info(&ri); sb->s_flags = s_flags; return 0; scan_error: nilfs_err(sb, "error %d while searching super root", err); goto failed; failed_unload: nilfs_sysfs_delete_device_group(nilfs); sysfs_error: iput(nilfs->ns_cpfile); iput(nilfs->ns_sufile); iput(nilfs->ns_dat); failed: nilfs_clear_recovery_info(&ri); sb->s_flags = s_flags; return err; } static unsigned long long nilfs_max_size(unsigned int blkbits) { unsigned int max_bits; unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ if (max_bits < 64) res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); return res; } /** * nilfs_nrsvsegs - calculate the number of reserved segments * @nilfs: nilfs object * @nsegs: total number of segments * * Return: Number of reserved segments. */ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) { return max_t(unsigned long, NILFS_MIN_NRSVSEGS, DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, 100)); } /** * nilfs_max_segment_count - calculate the maximum number of segments * @nilfs: nilfs object * * Return: Maximum number of segments */ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) { u64 max_count = U64_MAX; max_count = div64_ul(max_count, nilfs->ns_blocks_per_segment); return min_t(u64, max_count, ULONG_MAX); } void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) { nilfs->ns_nsegments = nsegs; nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); } static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { u64 nsegments, nblocks; if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { nilfs_err(nilfs->ns_sb, "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).", le32_to_cpu(sbp->s_rev_level), le16_to_cpu(sbp->s_minor_rev_level), NILFS_CURRENT_REV, NILFS_MINOR_REV); return -EINVAL; } nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); if (nilfs->ns_sbsize > BLOCK_SIZE) return -EINVAL; nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); if (nilfs->ns_inode_size > nilfs->ns_blocksize) { nilfs_err(nilfs->ns_sb, "too large inode size: %d bytes", nilfs->ns_inode_size); return -EINVAL; } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { nilfs_err(nilfs->ns_sb, "too small inode size: %d bytes", nilfs->ns_inode_size); return -EINVAL; } nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); if (nilfs->ns_first_ino < NILFS_USER_INO) { nilfs_err(nilfs->ns_sb, "too small lower limit for non-reserved inode numbers: %u", nilfs->ns_first_ino); return -EINVAL; } nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { nilfs_err(nilfs->ns_sb, "too short segment: %lu blocks", nilfs->ns_blocks_per_segment); return -EINVAL; } nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); nilfs->ns_r_segments_percentage = le32_to_cpu(sbp->s_r_segments_percentage); if (nilfs->ns_r_segments_percentage < 1 || nilfs->ns_r_segments_percentage > 99) { nilfs_err(nilfs->ns_sb, "invalid reserved segments percentage: %lu", nilfs->ns_r_segments_percentage); return -EINVAL; } nsegments = le64_to_cpu(sbp->s_nsegments); if (nsegments > nilfs_max_segment_count(nilfs)) { nilfs_err(nilfs->ns_sb, "segment count %llu exceeds upper limit (%llu segments)", (unsigned long long)nsegments, (unsigned long long)nilfs_max_segment_count(nilfs)); return -EINVAL; } nblocks = sb_bdev_nr_blocks(nilfs->ns_sb); if (nblocks) { u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment; /* * To avoid failing to mount early device images without a * second superblock, exclude that block count from the * "min_block_count" calculation. */ if (nblocks < min_block_count) { nilfs_err(nilfs->ns_sb, "total number of segment blocks %llu exceeds device size (%llu blocks)", (unsigned long long)min_block_count, (unsigned long long)nblocks); return -EINVAL; } } nilfs_set_nsegments(nilfs, nsegments); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; } static int nilfs_valid_sb(struct nilfs_super_block *sbp) { static unsigned char sum[4]; const int sumoff = offsetof(struct nilfs_super_block, s_sum); size_t bytes; u32 crc; if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); crc = crc32_le(crc, sum, 4); crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, bytes - sumoff - 4); return crc == le32_to_cpu(sbp->s_sum); } /** * nilfs_sb2_bad_offset - check the location of the second superblock * @sbp: superblock raw data buffer * @offset: byte offset of second superblock calculated from device size * * nilfs_sb2_bad_offset() checks if the position on the second * superblock is valid or not based on the filesystem parameters * stored in @sbp. If @offset points to a location within the segment * area, or if the parameters themselves are not normal, it is * determined to be invalid. * * Return: true if invalid, false if valid. */ static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); u64 nsegments = le64_to_cpu(sbp->s_nsegments); u64 index; if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) return true; index = offset >> (shift_bits + BLOCK_SIZE_BITS); do_div(index, blocks_per_segment); return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) { int i; for (i = 0; i < 2; i++) { if (nilfs->ns_sbp[i]) { brelse(nilfs->ns_sbh[i]); nilfs->ns_sbh[i] = NULL; nilfs->ns_sbp[i] = NULL; } } } void nilfs_fall_back_super_block(struct the_nilfs *nilfs) { brelse(nilfs->ns_sbh[0]); nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; nilfs->ns_sbh[1] = NULL; nilfs->ns_sbp[1] = NULL; } void nilfs_swap_super_block(struct the_nilfs *nilfs) { struct buffer_head *tsbh = nilfs->ns_sbh[0]; struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; nilfs->ns_sbh[1] = tsbh; nilfs->ns_sbp[1] = tsbp; } static int nilfs_load_super_block(struct the_nilfs *nilfs, struct super_block *sb, int blocksize, struct nilfs_super_block **sbpp) { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0, older; if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { nilfs_err(sb, "device size too small"); return -EINVAL; } sb2off = NILFS_SB2_OFFSET_BYTES(devsize); sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); if (!sbp[0]) { if (!sbp[1]) { nilfs_err(sb, "unable to read superblock"); return -EIO; } nilfs_warn(sb, "unable to read primary superblock (blocksize = %d)", blocksize); } else if (!sbp[1]) { nilfs_warn(sb, "unable to read secondary superblock (blocksize = %d)", blocksize); } /* * Compare two super blocks and set 1 in swp if the secondary * super block is valid and newer. Otherwise, set 0 in swp. */ valid[0] = nilfs_valid_sb(sbp[0]); valid[1] = nilfs_valid_sb(sbp[1]); swp = valid[1] && (!valid[0] || le64_to_cpu(sbp[1]->s_last_cno) > le64_to_cpu(sbp[0]->s_last_cno)); if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { brelse(sbh[1]); sbh[1] = NULL; sbp[1] = NULL; valid[1] = 0; swp = 0; } if (!valid[swp]) { nilfs_release_super_block(nilfs); nilfs_err(sb, "couldn't find nilfs on the device"); return -EINVAL; } if (!valid[!swp]) nilfs_warn(sb, "broken superblock, retrying with spare superblock (blocksize = %d)", blocksize); if (swp) nilfs_swap_super_block(nilfs); /* * Calculate the array index of the older superblock data. * If one has been dropped, set index 0 pointing to the remaining one, * otherwise set index 1 pointing to the old one (including if both * are the same). * * Divided case valid[0] valid[1] swp -> older * ------------------------------------------------------------- * Both SBs are invalid 0 0 N/A (Error) * SB1 is invalid 0 1 1 0 * SB2 is invalid 1 0 0 0 * SB2 is newer 1 1 1 0 * SB2 is older or the same 1 1 0 1 */ older = valid[1] ^ swp; nilfs->ns_sbwcount = 0; nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); nilfs->ns_prot_seq = le64_to_cpu(sbp[older]->s_last_seq); *sbpp = sbp[0]; return 0; } /** * init_nilfs - initialize a NILFS instance. * @nilfs: the_nilfs structure * @sb: super block * * init_nilfs() performs common initialization per block device (e.g. * reading the super block, getting disk layout information, initializing * shared fields in the_nilfs). * * Return: 0 on success, or a negative error code on failure. */ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { struct nilfs_super_block *sbp; int blocksize; int err; blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { nilfs_err(sb, "unable to set blocksize"); err = -EINVAL; goto out; } err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; err = nilfs_store_magic(sb, sbp); if (err) goto failed_sbh; err = nilfs_check_feature_compatibility(sb, sbp); if (err) goto failed_sbh; err = nilfs_get_blocksize(sb, sbp, &blocksize); if (err) goto failed_sbh; if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); err = -EINVAL; goto failed_sbh; } if (sb->s_blocksize != blocksize) { int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { nilfs_err(sb, "blocksize %d too small for device (sector-size = %d)", blocksize, hw_blocksize); err = -EINVAL; goto failed_sbh; } nilfs_release_super_block(nilfs); if (!sb_set_blocksize(sb, blocksize)) { nilfs_err(sb, "bad blocksize %d", blocksize); err = -EINVAL; goto out; } err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; /* * Not to failed_sbh; sbh is released automatically * when reloading fails. */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; nilfs->ns_blocksize = blocksize; err = nilfs_store_disk_layout(nilfs, sbp); if (err) goto failed_sbh; sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); err = nilfs_store_log_cursor(nilfs, sbp); if (err) goto failed_sbh; set_nilfs_init(nilfs); err = 0; out: return err; failed_sbh: nilfs_release_super_block(nilfs); goto out; } int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, size_t nsegs) { sector_t seg_start, seg_end; sector_t start = 0, nblocks = 0; unsigned int sects_per_block; __u64 *sn; int ret = 0; sects_per_block = (1 << nilfs->ns_blocksize_bits) / bdev_logical_block_size(nilfs->ns_bdev); for (sn = segnump; sn < segnump + nsegs; sn++) { nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); if (!nblocks) { start = seg_start; nblocks = seg_end - seg_start + 1; } else if (start + nblocks == seg_start) { nblocks += seg_end - seg_start + 1; } else { ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (ret < 0) return ret; nblocks = 0; } } if (nblocks) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); return ret; } int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { unsigned long ncleansegs; ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } int nilfs_near_disk_full(struct the_nilfs *nilfs) { unsigned long ncleansegs, nincsegs; ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / nilfs->ns_blocks_per_segment + 1; return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; } struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node *n; struct nilfs_root *root; spin_lock(&nilfs->ns_cptree_lock); n = nilfs->ns_cptree.rb_node; while (n) { root = rb_entry(n, struct nilfs_root, rb_node); if (cno < root->cno) { n = n->rb_left; } else if (cno > root->cno) { n = n->rb_right; } else { refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } } spin_unlock(&nilfs->ns_cptree_lock); return NULL; } struct nilfs_root * nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node **p, *parent; struct nilfs_root *root, *new; int err; root = nilfs_lookup_root(nilfs, cno); if (root) return root; new = kzalloc(sizeof(*root), GFP_KERNEL); if (!new) return NULL; spin_lock(&nilfs->ns_cptree_lock); p = &nilfs->ns_cptree.rb_node; parent = NULL; while (*p) { parent = *p; root = rb_entry(parent, struct nilfs_root, rb_node); if (cno < root->cno) { p = &(*p)->rb_left; } else if (cno > root->cno) { p = &(*p)->rb_right; } else { refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; } } new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); rb_link_node(&new->rb_node, parent, p); rb_insert_color(&new->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); err = nilfs_sysfs_create_snapshot_group(new); if (err) { kfree(new); new = NULL; } return new; } void nilfs_put_root(struct nilfs_root *root) { struct the_nilfs *nilfs = root->nilfs; if (refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) { rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); nilfs_sysfs_delete_snapshot_group(root); iput(root->ifile); kfree(root); } } |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2002, 2004 * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * Copyright (c) 2002-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * SCTP over IPv6. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Le Yanqun <yanqun.le@nokia.com> * Hui Huang <hui.huang@nokia.com> * La Monte H.P. Yarroll <piggy@acm.org> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * * Based on: * linux/net/ipv6/tcp_ipv6.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/ipsec.h> #include <linux/slab.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/seq_file.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/sctp/sctp.h> #include <net/udp_tunnel.h> #include <linux/uaccess.h> static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2); static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port); static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2); /* Event handler for inet6 address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->idev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET6 && ipv6_addr_equal(&addr->a.v6.sin6_addr, &ifa->addr) && addr->a.v6.sin6_scope_id == ifa->idev->dev->ifindex) { found = 1; addr->valid = 0; list_del_rcu(&addr->list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } static struct notifier_block sctp_inet6addr_notifier = { .notifier_call = sctp_inet6addr_event, }; static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMPV6_PKT_TOOBIG: if (ip6_sk_accept_pmtu(sk)) sctp_icmp_frag_needed(sk, asoc, t, info); return; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } break; case NDISC_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: break; } icmpv6_err_convert(type, code, &err); if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { sk->sk_err = err; sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } } /* ICMP error handler. */ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, offset); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original pointers. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } sctp_v6_err_handle(transport, skb, type, code, ntohl(info)); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmp6hdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmp6hdr *)(skb_network_header(skb) - sizeof(struct icmp6hdr)); if (hdr->icmp6_type == NDISC_REDIRECT) { /* can't be handled without outer ip6hdr known, leave it to udpv6_err */ sctp_err_finish(sk, t); return 0; } if (hdr->icmp6_type == ICMPV6_PKT_TOOBIG) info = ntohl(hdr->icmp6_mtu); sctp_v6_err_handle(t, skb, hdr->icmp6_type, hdr->icmp6_code, info); sctp_err_finish(sk, t); return 1; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { struct dst_entry *dst = dst_clone(t->dst); struct flowi6 *fl6 = &t->fl.u.ip6; struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); __u8 tclass = np->tclass; __be32 label; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); if (t->dscp & SCTP_DSCP_SET_MASK) tclass = t->dscp & SCTP_DSCP_VAL_MASK; if (INET_ECN_is_capable(tclass)) IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(t->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); if (!t->encap_port || !sctp_sk(sk)->udp_port) { int res; skb_dst_set(skb, dst); rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; } if (skb_is_gso(skb)) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; skb->encapsulation = 1; skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, sctp_sk(sk)->udp_port, t->encap_port, false); } /* Returns the dst cache entry for the given source and destination ip * addresses. */ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct dst_entry *dst = NULL; struct flowi _fl; struct flowi6 *fl6 = &_fl.u.ip6; struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; enum sctp_scope scope; __u8 matchlen = 0; memset(&_fl, 0, sizeof(_fl)); fl6->daddr = daddr->v6.sin6_addr; fl6->fl6_dport = daddr->v6.sin6_port; fl6->flowi6_proto = IPPROTO_SCTP; if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); if (inet6_test_bit(SNDFLOW, sk) && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) goto out; fl6_sock_release(flowlabel); } pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); if (asoc) fl6->fl6_sport = htons(asoc->base.bind_addr.port); if (saddr) { fl6->saddr = saddr->v6.sin6_addr; if (!fl6->fl6_sport) fl6->fl6_sport = saddr->v6.sin6_port; pr_debug("src=%pI6 - ", &fl6->saddr); } rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!asoc || saddr) { t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } bp = &asoc->base.bind_addr; scope = sctp_scope(daddr); /* ip6_dst_lookup has filled in the fl6->saddr for us. Check * to see if we can use it. */ if (!IS_ERR(dst)) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || laddr->state == SCTP_ADDR_DEL || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; /* Do not compare against v4 addrs */ if ((laddr->a.sa.sa_family == AF_INET6) && (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) { rcu_read_unlock(); t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get the * best source address for a given destination. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct dst_entry *bdst; __u8 bmatchlen; if (!laddr->valid || laddr->state != SCTP_ADDR_SRC || laddr->a.sa.sa_family != AF_INET6 || scope > sctp_scope(&laddr->a)) continue; fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; if (ipv6_chk_addr(dev_net(bdst->dev), &laddr->a.v6.sin6_addr, bdst->dev, 1)) { if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); if (matchlen > bmatchlen) { dst_release(bdst); continue; } if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; matchlen = bmatchlen; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } rcu_read_unlock(); out: if (!IS_ERR_OR_NULL(dst)) { struct rt6_info *rt; rt = dst_rt6_info(dst); t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl->u.ip6.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* Returns the number of consecutive initial bits that match in the 2 ipv6 * addresses. */ static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2) { return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr); } /* Fills in the source address(saddr) based on the destination address(daddr) * and asoc's bind address list. */ static void sctp_v6_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { struct flowi6 *fl6 = &fl->u.ip6; union sctp_addr *saddr = &t->saddr; pr_debug("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst); if (t->dst) { saddr->v6.sin6_family = AF_INET6; saddr->v6.sin6_addr = fl6->saddr; } } /* Make a copy of all potential local addresses. */ static void sctp_v6_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct inet6_dev *in6_dev; struct inet6_ifaddr *ifp; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in6_dev = __in6_dev_get(dev)) == NULL) { rcu_read_unlock(); return; } read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifp->addr; addr->a.v6.sin6_scope_id = dev->ifindex; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } read_unlock_bh(&in6_dev->lock); rcu_read_unlock(); } /* Copy over any ip options */ static void sctp_v6_copy_ip_options(struct sock *sk, struct sock *newsk) { struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct ipv6_txoptions *opt; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } /* Account for the IP options */ static int sctp_v6_ip_options_len(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; int len = 0; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) len = opt->opt_flen + opt->opt_nflen; rcu_read_unlock(); return len; } /* Initialize a sockaddr_storage from in incoming skb. */ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in6 *sa = &addr->v6; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; if (is_saddr) { sa->sin6_port = sh->source; sa->sin6_addr = ipv6_hdr(skb)->saddr; } else { sa->sin6_port = sh->dest; sa->sin6_addr = ipv6_hdr(skb)->daddr; } } /* Initialize an sctp_addr from a socket. */ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_rcv_saddr.s6_addr32[0] = 0; sk->sk_v6_rcv_saddr.s6_addr32[1] = 0; sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_rcv_saddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_rcv_saddr = addr->v6.sin6_addr; } } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_daddr.s6_addr32[0] = 0; sk->sk_v6_daddr.s6_addr32[1] = 0; sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_daddr = addr->v6.sin6_addr; } } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) return false; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v6_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv6addr_param); param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); param->v6.addr = addr->v6.sin6_addr; return length; } /* Initialize a sctp_addr from struct in6_addr. */ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port) { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; addr->v6.sin6_scope_id = 0; } static int __sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) { if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr2->v6.sin6_addr) && addr2->v6.sin6_addr.s6_addr32[3] == addr1->v4.sin_addr.s_addr) return 1; if (addr2->sa.sa_family == AF_INET && addr1->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr1->v6.sin6_addr) && addr1->v6.sin6_addr.s6_addr32[3] == addr2->v4.sin_addr.s_addr) return 1; return 0; } if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) && addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id && addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id) return 0; return 1; } /* Compare addresses exactly. * v4-mapped-v6 is also in consideration. */ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { return __sctp_v6_cmp_addr(addr1, addr2) && addr1->v6.sin6_port == addr2->v6.sin6_port; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port) { memset(addr, 0x00, sizeof(union sctp_addr)); addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; } /* Is this a wildcard address? */ static int sctp_v6_is_any(const union sctp_addr *addr) { return ipv6_addr_any(&addr->v6.sin6_addr); } /* Should this be available for binding? */ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); struct net_device *dev = NULL; int type, res, bound_dev_if; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) return 1; if (type == IPV6_ADDR_MAPPED) { if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->available(addr, sp); } if (!(type & IPV6_ADDR_UNICAST)) return 0; rcu_read_lock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if) { res = 0; dev = dev_get_by_index_rcu(net, bound_dev_if); if (!dev) goto out; } res = ipv6_can_nonlocal_bind(net, &sp->inet) || ipv6_chk_addr(net, in6, dev, 0); out: rcu_read_unlock(); return res; } /* This function checks if the address is a valid address to be used for * SCTP. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { int ret = ipv6_addr_type(&addr->v6.sin6_addr); /* Support v4-mapped-v6 address. */ if (ret == IPV6_ADDR_MAPPED) { /* Note: This routine is used in input, so v4-mapped-v6 * are disallowed here when there is no sctp_sock. */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); } /* Is this a non-unicast address */ if (!(ret & IPV6_ADDR_UNICAST)) return 0; return 1; } /* What is the scope of 'addr'? */ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) { enum sctp_scope retval; int v6scope; /* The IPv6 scope is really a set of bit fields. * See IFA_* in <net/if_inet6.h>. Map to a generic SCTP scope. */ v6scope = ipv6_addr_scope(&addr->v6.sin6_addr); switch (v6scope) { case IFA_HOST: retval = SCTP_SCOPE_LOOPBACK; break; case IFA_LINK: retval = SCTP_SCOPE_LINK; break; case IFA_SITE: retval = SCTP_SCOPE_PRIVATE; break; default: retval = SCTP_SCOPE_GLOBAL; break; } return retval; } /* Create and initialize a new sk for the socket to be returned by accept(). */ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(sk, SOCK_ZAPPED); newsctp6sk = (struct sctp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; sctp_v6_copy_ip_options(sk, newsk); /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). */ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } /* Format a sockaddr for return to user space. This makes sure the return is * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. */ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { if (sp->v4mapped) { if (addr->sa.sa_family == AF_INET) sctp_v4_map_v6(addr); } else { if (addr->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr->v6.sin6_addr)) sctp_v6_map_v4(addr); } if (addr->sa.sa_family == AF_INET) { memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } return sizeof(struct sockaddr_in6); } /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { return inet6_iif(skb); } static int sctp_v6_skb_sdif(const struct sk_buff *skb) { return inet6_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr); } static void sctp_v6_ecn_capable(struct sock *sk) { inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } /* Initialize a PF_INET msgname from a ulpevent. */ static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addrlen) { union sctp_addr *addr; struct sctp_association *asoc; union sctp_addr *paddr; if (!msgname) return; addr = (union sctp_addr *)msgname; asoc = event->asoc; paddr = &asoc->peer.primary_addr; if (paddr->sa.sa_family == AF_INET) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = htons(asoc->peer.port); addr->v4.sin_addr = paddr->v4.sin_addr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id; else addr->v6.sin6_scope_id = 0; addr->v6.sin6_port = htons(asoc->peer.port); addr->v6.sin6_addr = paddr->v6.sin6_addr; } *addrlen = sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr); } /* Initialize a msg_name from an inbound skb. */ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, int *addr_len) { union sctp_addr *addr; struct sctphdr *sh; if (!msgname) return; addr = (union sctp_addr *)msgname; sh = sctp_hdr(skb); if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); else addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); } /* Do we support this AF? */ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) { switch (family) { case AF_INET6: return 1; /* v4-mapped-v6 addresses */ case AF_INET: if (!ipv6_only_sock(sctp_opt2sk(sp))) return 1; fallthrough; default: return 0; } } /* Address matching with wildcards allowed. This extra level * of indirection lets us choose whether a PF_INET6 should * disallow any v4 addresses if we so choose. */ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { struct sock *sk = sctp_opt2sk(opt); struct sctp_af *af1, *af2; af1 = sctp_get_af_specific(addr1->sa.sa_family); af2 = sctp_get_af_specific(addr2->sa.sa_family); if (!af1 || !af2) return 0; /* If the socket is IPv6 only, v4 addrs will not match */ if (ipv6_only_sock(sk) && af1 != af2) return 0; /* Today, wildcard AF_INET/AF_INET6. */ if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2)) return 1; if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET) return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr; return __sctp_v6_cmp_addr(addr1, addr2); } /* Verify that the provided sockaddr looks bindable. Common verification, * has already been taken care of. */ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { struct net *net; if (!addr->v6.sin6_scope_id) return 0; net = sock_net(&opt->inet.sk); rcu_read_lock(); dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id); if (!dev || !(ipv6_can_nonlocal_bind(net, &opt->inet) || ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0))) { rcu_read_unlock(); return 0; } rcu_read_unlock(); } af = opt->pf->af; } return af->available(addr, opt); } /* Verify that the provided sockaddr looks sendable. Common verification, * has already been taken care of. */ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af = NULL; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { if (!addr->v6.sin6_scope_id) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(&opt->inet.sk), addr->v6.sin6_scope_id); rcu_read_unlock(); if (!dev) return 0; } af = opt->pf->af; } return af != NULL; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Note: In the future, we may want to look at sock options * to determine whether a PF_INET6 socket really wants to have IPV4 * addresses. * Returns number of addresses supported. */ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV6_ADDRESS; if (!opt || !ipv6_only_sock(sctp_opt2sk(opt))) { types[1] = SCTP_PARAM_IPV4_ADDRESS; return 2; } return 1; } /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int rc; rc = inet6_getname(sock, uaddr, peer); if (rc < 0) return rc; rc = sctp_v6_addr_to_user(sctp_sk(sock->sk), (union sctp_addr *)uaddr); return rc; } static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, .poll = sctp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw sctpv6_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctpv6_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG, }; static int sctp6_rcv(struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb) ? -1 : 0; } static const struct inet6_protocol sctpv6_protocol = { .handler = sctp6_rcv, .err_handler = sctp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; static struct sctp_af sctp_af_inet6 = { .sa_family = AF_INET6, .sctp_xmit = sctp_v6_xmit, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .get_dst = sctp_v6_get_dst, .get_saddr = sctp_v6_get_saddr, .copy_addrlist = sctp_v6_copy_addrlist, .from_skb = sctp_v6_from_skb, .from_sk = sctp_v6_from_sk, .from_addr_param = sctp_v6_from_addr_param, .to_addr_param = sctp_v6_to_addr_param, .cmp_addr = sctp_v6_cmp_addr, .scope = sctp_v6_scope, .addr_valid = sctp_v6_addr_valid, .inaddr_any = sctp_v6_inaddr_any, .is_any = sctp_v6_is_any, .available = sctp_v6_available, .skb_iif = sctp_v6_skb_iif, .skb_sdif = sctp_v6_skb_sdif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, .ecn_capable = sctp_v6_ecn_capable, .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), .ip_options_len = sctp_v6_ip_options_len, }; static struct sctp_pf sctp_pf_inet6 = { .event_msgname = sctp_inet6_event_msgname, .skb_msgname = sctp_inet6_skb_msgname, .af_supported = sctp_inet6_af_supported, .cmp_addr = sctp_inet6_cmp_addr, .bind_verify = sctp_inet6_bind_verify, .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, .create_accept_sk = sctp_v6_create_accept_sk, .addr_to_user = sctp_v6_addr_to_user, .to_sk_saddr = sctp_v6_to_sk_saddr, .to_sk_daddr = sctp_v6_to_sk_daddr, .copy_ip_options = sctp_v6_copy_ip_options, .af = &sctp_af_inet6, }; /* Initialize IPv6 support and register with socket layer. */ void sctp_v6_pf_init(void) { /* Register the SCTP specific PF_INET6 functions. */ sctp_register_pf(&sctp_pf_inet6, PF_INET6); /* Register the SCTP specific AF_INET6 functions. */ sctp_register_af(&sctp_af_inet6); } void sctp_v6_pf_exit(void) { list_del(&sctp_af_inet6.list); } /* Initialize IPv6 support and register with socket layer. */ int sctp_v6_protosw_init(void) { int rc; rc = proto_register(&sctpv6_prot, 1); if (rc) return rc; /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ inet6_register_protosw(&sctpv6_seqpacket_protosw); inet6_register_protosw(&sctpv6_stream_protosw); return 0; } void sctp_v6_protosw_exit(void) { inet6_unregister_protosw(&sctpv6_seqpacket_protosw); inet6_unregister_protosw(&sctpv6_stream_protosw); proto_unregister(&sctpv6_prot); } /* Register with inet6 layer. */ int sctp_v6_add_protocol(void) { /* Register notifier for inet6 address additions/deletions. */ register_inet6addr_notifier(&sctp_inet6addr_notifier); if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } /* Unregister with inet6 layer. */ void sctp_v6_del_protocol(void) { inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); unregister_inet6addr_notifier(&sctp_inet6addr_notifier); } |
| 1 3 3 3 1 3 3 3 3 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Digital Audio (PCM) abstract layer * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/io.h> #include <linux/time.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/info.h> #include <sound/initval.h> #include "pcm_local.h" static int preallocate_dma = 1; module_param(preallocate_dma, int, 0444); MODULE_PARM_DESC(preallocate_dma, "Preallocate DMA memory when the PCM devices are initialized."); static int maximum_substreams = 4; module_param(maximum_substreams, int, 0444); MODULE_PARM_DESC(maximum_substreams, "Maximum substreams with preallocated DMA memory."); static const size_t snd_minimum_buffer = 16384; static unsigned long max_alloc_per_card = 32UL * 1024UL * 1024UL; module_param(max_alloc_per_card, ulong, 0644); MODULE_PARM_DESC(max_alloc_per_card, "Max total allocation bytes per card."); static void __update_allocated_size(struct snd_card *card, ssize_t bytes) { card->total_pcm_alloc_bytes += bytes; } static void update_allocated_size(struct snd_card *card, ssize_t bytes) { guard(mutex)(&card->memory_mutex); __update_allocated_size(card, bytes); } static void decrease_allocated_size(struct snd_card *card, size_t bytes) { guard(mutex)(&card->memory_mutex); WARN_ON(card->total_pcm_alloc_bytes < bytes); __update_allocated_size(card, -(ssize_t)bytes); } static int do_alloc_pages(struct snd_card *card, int type, struct device *dev, int str, size_t size, struct snd_dma_buffer *dmab) { enum dma_data_direction dir; int err; /* check and reserve the requested size */ scoped_guard(mutex, &card->memory_mutex) { if (max_alloc_per_card && card->total_pcm_alloc_bytes + size > max_alloc_per_card) return -ENOMEM; __update_allocated_size(card, size); } if (str == SNDRV_PCM_STREAM_PLAYBACK) dir = DMA_TO_DEVICE; else dir = DMA_FROM_DEVICE; err = snd_dma_alloc_dir_pages(type, dev, dir, size, dmab); if (!err) { /* the actual allocation size might be bigger than requested, * and we need to correct the account */ if (dmab->bytes != size) update_allocated_size(card, dmab->bytes - size); } else { /* take back on allocation failure */ decrease_allocated_size(card, size); } return err; } static void do_free_pages(struct snd_card *card, struct snd_dma_buffer *dmab) { if (!dmab->area) return; decrease_allocated_size(card, dmab->bytes); snd_dma_free_pages(dmab); dmab->area = NULL; } /* * try to allocate as the large pages as possible. * stores the resultant memory size in *res_size. * * the minimum size is snd_minimum_buffer. it should be power of 2. */ static int preallocate_pcm_pages(struct snd_pcm_substream *substream, size_t size, bool no_fallback) { struct snd_dma_buffer *dmab = &substream->dma_buffer; struct snd_card *card = substream->pcm->card; size_t orig_size = size; int err; do { err = do_alloc_pages(card, dmab->dev.type, dmab->dev.dev, substream->stream, size, dmab); if (err != -ENOMEM) return err; if (no_fallback) break; size >>= 1; } while (size >= snd_minimum_buffer); dmab->bytes = 0; /* tell error */ pr_warn("ALSA pcmC%dD%d%c,%d:%s: cannot preallocate for size %zu\n", substream->pcm->card->number, substream->pcm->device, substream->stream ? 'c' : 'p', substream->number, substream->pcm->name, orig_size); return -ENOMEM; } /** * snd_pcm_lib_preallocate_free - release the preallocated buffer of the specified substream. * @substream: the pcm substream instance * * Releases the pre-allocated buffer of the given substream. */ void snd_pcm_lib_preallocate_free(struct snd_pcm_substream *substream) { do_free_pages(substream->pcm->card, &substream->dma_buffer); } /** * snd_pcm_lib_preallocate_free_for_all - release all pre-allocated buffers on the pcm * @pcm: the pcm instance * * Releases all the pre-allocated buffers on the given pcm. */ void snd_pcm_lib_preallocate_free_for_all(struct snd_pcm *pcm) { struct snd_pcm_substream *substream; int stream; for_each_pcm_substream(pcm, stream, substream) snd_pcm_lib_preallocate_free(substream); } EXPORT_SYMBOL(snd_pcm_lib_preallocate_free_for_all); #ifdef CONFIG_SND_VERBOSE_PROCFS /* * read callback for prealloc proc file * * prints the current allocated size in kB. */ static void snd_pcm_lib_preallocate_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_substream *substream = entry->private_data; snd_iprintf(buffer, "%lu\n", (unsigned long) substream->dma_buffer.bytes / 1024); } /* * read callback for prealloc_max proc file * * prints the maximum allowed size in kB. */ static void snd_pcm_lib_preallocate_max_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_substream *substream = entry->private_data; snd_iprintf(buffer, "%lu\n", (unsigned long) substream->dma_max / 1024); } /* * write callback for prealloc proc file * * accepts the preallocation size in kB. */ static void snd_pcm_lib_preallocate_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_substream *substream = entry->private_data; struct snd_card *card = substream->pcm->card; char line[64], str[64]; unsigned long size; struct snd_dma_buffer new_dmab; guard(mutex)(&substream->pcm->open_mutex); if (substream->runtime) { buffer->error = -EBUSY; return; } if (!snd_info_get_line(buffer, line, sizeof(line))) { snd_info_get_str(str, line, sizeof(str)); buffer->error = kstrtoul(str, 10, &size); if (buffer->error != 0) return; size *= 1024; if ((size != 0 && size < 8192) || size > substream->dma_max) { buffer->error = -EINVAL; return; } if (substream->dma_buffer.bytes == size) return; memset(&new_dmab, 0, sizeof(new_dmab)); new_dmab.dev = substream->dma_buffer.dev; if (size > 0) { if (do_alloc_pages(card, substream->dma_buffer.dev.type, substream->dma_buffer.dev.dev, substream->stream, size, &new_dmab) < 0) { buffer->error = -ENOMEM; pr_debug("ALSA pcmC%dD%d%c,%d:%s: cannot preallocate for size %lu\n", substream->pcm->card->number, substream->pcm->device, substream->stream ? 'c' : 'p', substream->number, substream->pcm->name, size); return; } substream->buffer_bytes_max = size; } else { substream->buffer_bytes_max = UINT_MAX; } if (substream->dma_buffer.area) do_free_pages(card, &substream->dma_buffer); substream->dma_buffer = new_dmab; } else { buffer->error = -EINVAL; } } static inline void preallocate_info_init(struct snd_pcm_substream *substream) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(substream->pcm->card, "prealloc", substream->proc_root); if (entry) { snd_info_set_text_ops(entry, substream, snd_pcm_lib_preallocate_proc_read); entry->c.text.write = snd_pcm_lib_preallocate_proc_write; entry->mode |= 0200; } entry = snd_info_create_card_entry(substream->pcm->card, "prealloc_max", substream->proc_root); if (entry) snd_info_set_text_ops(entry, substream, snd_pcm_lib_preallocate_max_proc_read); } #else /* !CONFIG_SND_VERBOSE_PROCFS */ static inline void preallocate_info_init(struct snd_pcm_substream *substream) { } #endif /* CONFIG_SND_VERBOSE_PROCFS */ /* * pre-allocate the buffer and create a proc file for the substream */ static int preallocate_pages(struct snd_pcm_substream *substream, int type, struct device *data, size_t size, size_t max, bool managed) { int err; if (snd_BUG_ON(substream->dma_buffer.dev.type)) return -EINVAL; substream->dma_buffer.dev.type = type; substream->dma_buffer.dev.dev = data; if (size > 0) { if (!max) { /* no fallback, only also inform -ENOMEM */ err = preallocate_pcm_pages(substream, size, true); if (err < 0) return err; } else if (preallocate_dma && substream->number < maximum_substreams) { err = preallocate_pcm_pages(substream, size, false); if (err < 0 && err != -ENOMEM) return err; } } if (substream->dma_buffer.bytes > 0) substream->buffer_bytes_max = substream->dma_buffer.bytes; substream->dma_max = max; if (max > 0) preallocate_info_init(substream); if (managed) substream->managed_buffer_alloc = 1; return 0; } static int preallocate_pages_for_all(struct snd_pcm *pcm, int type, void *data, size_t size, size_t max, bool managed) { struct snd_pcm_substream *substream; int stream, err; for_each_pcm_substream(pcm, stream, substream) { err = preallocate_pages(substream, type, data, size, max, managed); if (err < 0) return err; } return 0; } /** * snd_pcm_lib_preallocate_pages - pre-allocation for the given DMA type * @substream: the pcm substream instance * @type: DMA type (SNDRV_DMA_TYPE_*) * @data: DMA type dependent data * @size: the requested pre-allocation size in bytes * @max: the max. allowed pre-allocation size * * Do pre-allocation for the given DMA buffer type. */ void snd_pcm_lib_preallocate_pages(struct snd_pcm_substream *substream, int type, struct device *data, size_t size, size_t max) { preallocate_pages(substream, type, data, size, max, false); } EXPORT_SYMBOL(snd_pcm_lib_preallocate_pages); /** * snd_pcm_lib_preallocate_pages_for_all - pre-allocation for continuous memory type (all substreams) * @pcm: the pcm instance * @type: DMA type (SNDRV_DMA_TYPE_*) * @data: DMA type dependent data * @size: the requested pre-allocation size in bytes * @max: the max. allowed pre-allocation size * * Do pre-allocation to all substreams of the given pcm for the * specified DMA type. */ void snd_pcm_lib_preallocate_pages_for_all(struct snd_pcm *pcm, int type, void *data, size_t size, size_t max) { preallocate_pages_for_all(pcm, type, data, size, max, false); } EXPORT_SYMBOL(snd_pcm_lib_preallocate_pages_for_all); /** * snd_pcm_set_managed_buffer - set up buffer management for a substream * @substream: the pcm substream instance * @type: DMA type (SNDRV_DMA_TYPE_*) * @data: DMA type dependent data * @size: the requested pre-allocation size in bytes * @max: the max. allowed pre-allocation size * * Do pre-allocation for the given DMA buffer type, and set the managed * buffer allocation mode to the given substream. * In this mode, PCM core will allocate a buffer automatically before PCM * hw_params ops call, and release the buffer after PCM hw_free ops call * as well, so that the driver doesn't need to invoke the allocation and * the release explicitly in its callback. * When a buffer is actually allocated before the PCM hw_params call, it * turns on the runtime buffer_changed flag for drivers changing their h/w * parameters accordingly. * * When @size is non-zero and @max is zero, this tries to allocate for only * the exact buffer size without fallback, and may return -ENOMEM. * Otherwise, the function tries to allocate smaller chunks if the allocation * fails. This is the behavior of snd_pcm_set_fixed_buffer(). * * When both @size and @max are zero, the function only sets up the buffer * for later dynamic allocations. It's used typically for buffers with * SNDRV_DMA_TYPE_VMALLOC type. * * Upon successful buffer allocation and setup, the function returns 0. * * Return: zero if successful, or a negative error code */ int snd_pcm_set_managed_buffer(struct snd_pcm_substream *substream, int type, struct device *data, size_t size, size_t max) { return preallocate_pages(substream, type, data, size, max, true); } EXPORT_SYMBOL(snd_pcm_set_managed_buffer); /** * snd_pcm_set_managed_buffer_all - set up buffer management for all substreams * for all substreams * @pcm: the pcm instance * @type: DMA type (SNDRV_DMA_TYPE_*) * @data: DMA type dependent data * @size: the requested pre-allocation size in bytes * @max: the max. allowed pre-allocation size * * Do pre-allocation to all substreams of the given pcm for the specified DMA * type and size, and set the managed_buffer_alloc flag to each substream. * * Return: zero if successful, or a negative error code */ int snd_pcm_set_managed_buffer_all(struct snd_pcm *pcm, int type, struct device *data, size_t size, size_t max) { return preallocate_pages_for_all(pcm, type, data, size, max, true); } EXPORT_SYMBOL(snd_pcm_set_managed_buffer_all); /** * snd_pcm_lib_malloc_pages - allocate the DMA buffer * @substream: the substream to allocate the DMA buffer to * @size: the requested buffer size in bytes * * Allocates the DMA buffer on the BUS type given earlier to * snd_pcm_lib_preallocate_xxx_pages(). * * Return: 1 if the buffer is changed, 0 if not changed, or a negative * code on failure. */ int snd_pcm_lib_malloc_pages(struct snd_pcm_substream *substream, size_t size) { struct snd_card *card; struct snd_pcm_runtime *runtime; struct snd_dma_buffer *dmab = NULL; if (PCM_RUNTIME_CHECK(substream)) return -EINVAL; if (snd_BUG_ON(substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_UNKNOWN)) return -EINVAL; runtime = substream->runtime; card = substream->pcm->card; if (runtime->dma_buffer_p) { /* perphaps, we might free the large DMA memory region to save some space here, but the actual solution costs us less time */ if (runtime->dma_buffer_p->bytes >= size) { runtime->dma_bytes = size; return 0; /* ok, do not change */ } snd_pcm_lib_free_pages(substream); } if (substream->dma_buffer.area != NULL && substream->dma_buffer.bytes >= size) { dmab = &substream->dma_buffer; /* use the pre-allocated buffer */ } else { /* dma_max=0 means the fixed size preallocation */ if (substream->dma_buffer.area && !substream->dma_max) return -ENOMEM; dmab = kzalloc(sizeof(*dmab), GFP_KERNEL); if (! dmab) return -ENOMEM; dmab->dev = substream->dma_buffer.dev; if (do_alloc_pages(card, substream->dma_buffer.dev.type, substream->dma_buffer.dev.dev, substream->stream, size, dmab) < 0) { kfree(dmab); pr_debug("ALSA pcmC%dD%d%c,%d:%s: cannot allocate for size %zu\n", substream->pcm->card->number, substream->pcm->device, substream->stream ? 'c' : 'p', substream->number, substream->pcm->name, size); return -ENOMEM; } } snd_pcm_set_runtime_buffer(substream, dmab); runtime->dma_bytes = size; return 1; /* area was changed */ } EXPORT_SYMBOL(snd_pcm_lib_malloc_pages); /** * snd_pcm_lib_free_pages - release the allocated DMA buffer. * @substream: the substream to release the DMA buffer * * Releases the DMA buffer allocated via snd_pcm_lib_malloc_pages(). * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_lib_free_pages(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; if (PCM_RUNTIME_CHECK(substream)) return -EINVAL; runtime = substream->runtime; if (runtime->dma_area == NULL) return 0; if (runtime->dma_buffer_p != &substream->dma_buffer) { struct snd_card *card = substream->pcm->card; /* it's a newly allocated buffer. release it now. */ do_free_pages(card, runtime->dma_buffer_p); kfree(runtime->dma_buffer_p); } snd_pcm_set_runtime_buffer(substream, NULL); return 0; } EXPORT_SYMBOL(snd_pcm_lib_free_pages); |
| 29 22 7 26 360 353 219 360 102 91 11 2 2 4 2 3 3 5 5 4 6 4 6 4 6 4 6 10 71 67 4 71 71 67 162 215 21 202 138 58 31 89 89 214 191 10 71 215 67 12 49 49 49 49 49 49 48 49 49 49 12 12 12 12 12 50 50 49 27 36 43 43 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include <linux/iversion.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_inode_util.h" #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_health.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_ag.h" #include "xfs_iunlink_item.h" #include "xfs_inode_item.h" uint16_t xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { /* can't set PREALLOC this way, just preserve it */ uint16_t di_flags = (ip->i_diflags & XFS_DIFLAG_PREALLOC); if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) di_flags |= XFS_DIFLAG_APPEND; if (xflags & FS_XFLAG_SYNC) di_flags |= XFS_DIFLAG_SYNC; if (xflags & FS_XFLAG_NOATIME) di_flags |= XFS_DIFLAG_NOATIME; if (xflags & FS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; if (xflags & FS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; if (S_ISDIR(VFS_I(ip)->i_mode)) { if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & FS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & FS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(VFS_I(ip)->i_mode)) { if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } return di_flags; } uint64_t xfs_flags2diflags2( struct xfs_inode *ip, unsigned int xflags) { uint64_t di_flags2 = (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; return di_flags2; } uint32_t xfs_ip2xflags( struct xfs_inode *ip) { uint32_t flags = 0; if (ip->i_diflags & XFS_DIFLAG_ANY) { if (ip->i_diflags & XFS_DIFLAG_REALTIME) flags |= FS_XFLAG_REALTIME; if (ip->i_diflags & XFS_DIFLAG_PREALLOC) flags |= FS_XFLAG_PREALLOC; if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) flags |= FS_XFLAG_IMMUTABLE; if (ip->i_diflags & XFS_DIFLAG_APPEND) flags |= FS_XFLAG_APPEND; if (ip->i_diflags & XFS_DIFLAG_SYNC) flags |= FS_XFLAG_SYNC; if (ip->i_diflags & XFS_DIFLAG_NOATIME) flags |= FS_XFLAG_NOATIME; if (ip->i_diflags & XFS_DIFLAG_NODUMP) flags |= FS_XFLAG_NODUMP; if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) flags |= FS_XFLAG_RTINHERIT; if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) flags |= FS_XFLAG_PROJINHERIT; if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) flags |= FS_XFLAG_NOSYMLINKS; if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) flags |= FS_XFLAG_EXTSIZE; if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) flags |= FS_XFLAG_EXTSZINHERIT; if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) flags |= FS_XFLAG_NODEFRAG; if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) flags |= FS_XFLAG_FILESTREAM; } if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { if (ip->i_diflags2 & XFS_DIFLAG2_DAX) flags |= FS_XFLAG_DAX; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; } if (xfs_inode_has_attr_fork(ip)) flags |= FS_XFLAG_HASATTR; return flags; } prid_t xfs_get_initial_prid(struct xfs_inode *dp) { if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT) return dp->i_projid; /* Assign to the root project by default. */ return 0; } /* Propagate di_flags from a parent inode to a child inode. */ static inline void xfs_inode_inherit_flags( struct xfs_inode *ip, const struct xfs_inode *pip) { unsigned int di_flags = 0; xfs_failaddr_t failaddr; umode_t mode = VFS_I(ip)->i_mode; if (S_ISDIR(mode)) { if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_extsize = pip->i_extsize; } if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && xfs_has_realtime(ip->i_mount)) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; ip->i_extsize = pip->i_extsize; } } if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && xfs_inherit_noatime) di_flags |= XFS_DIFLAG_NOATIME; if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && xfs_inherit_nodump) di_flags |= XFS_DIFLAG_NODUMP; if ((pip->i_diflags & XFS_DIFLAG_SYNC) && xfs_inherit_sync) di_flags |= XFS_DIFLAG_SYNC; if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) di_flags |= XFS_DIFLAG_NOSYMLINKS; if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; ip->i_diflags |= di_flags; /* * Inode verifiers on older kernels only check that the extent size * hint is an integer multiple of the rt extent size on realtime files. * They did not check the hint alignment on a directory with both * rtinherit and extszinherit flags set. If the misaligned hint is * propagated from a directory into a new realtime file, new file * allocations will fail due to math errors in the rt allocator and/or * trip the verifiers. Validate the hint settings in the new file so * that we don't let broken hints propagate. */ failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, VFS_I(ip)->i_mode, ip->i_diflags); if (failaddr) { ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT); ip->i_extsize = 0; } } /* Propagate di_flags2 from a parent inode to a child inode. */ static inline void xfs_inode_inherit_flags2( struct xfs_inode *ip, const struct xfs_inode *pip) { xfs_failaddr_t failaddr; if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_cowextsize = pip->i_cowextsize; } if (pip->i_diflags2 & XFS_DIFLAG2_DAX) ip->i_diflags2 |= XFS_DIFLAG2_DAX; if (xfs_is_metadir_inode(pip)) ip->i_diflags2 |= XFS_DIFLAG2_METADATA; /* Don't let invalid cowextsize hints propagate. */ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); if (failaddr) { ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; ip->i_cowextsize = 0; } } /* * If we need to create attributes immediately after allocating the inode, * initialise an empty attribute fork right now. We use the default fork offset * for attributes here as we don't know exactly what size or how many * attributes we might be adding. We can do this safely here because we know * the data fork is completely empty and this saves us from needing to run a * separate transaction to set the fork offset in the immediate future. * * If we have parent pointers and the caller hasn't told us that the file will * never be linked into a directory tree, we /must/ create the attr fork. */ static inline bool xfs_icreate_want_attrfork( struct xfs_mount *mp, const struct xfs_icreate_args *args) { if (args->flags & XFS_ICREATE_INIT_XATTRS) return true; if (!(args->flags & XFS_ICREATE_UNLINKABLE) && xfs_has_parent(mp)) return true; return false; } /* Initialise an inode's attributes. */ void xfs_inode_init( struct xfs_trans *tp, const struct xfs_icreate_args *args, struct xfs_inode *ip) { struct xfs_inode *pip = args->pip; struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; struct inode *inode = VFS_I(ip); unsigned int flags; int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | XFS_ICHGTIME_ACCESS; if (args->flags & XFS_ICREATE_TMPFILE) set_nlink(inode, 0); else if (S_ISDIR(args->mode)) set_nlink(inode, 2); else set_nlink(inode, 1); inode->i_rdev = args->rdev; if (!args->idmap || pip == NULL) { /* creating a tree root, sb rooted, or detached file */ inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; ip->i_projid = 0; inode->i_mode = args->mode; } else { /* creating a child in the directory tree */ if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { inode_fsuid_set(inode, args->idmap); inode->i_gid = dir->i_gid; inode->i_mode = args->mode; } else { inode_init_owner(args->idmap, inode, dir, args->mode); } /* * If the group ID of the new file does not match the effective * group ID or one of the supplementary group IDs, the S_ISGID * bit is cleared (and only if the irix_sgid_inherit * compatibility variable is set). */ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) inode->i_mode &= ~S_ISGID; ip->i_projid = xfs_get_initial_prid(pip); } ip->i_disk_size = 0; ip->i_df.if_nextents = 0; ASSERT(ip->i_nblocks == 0); ip->i_extsize = 0; ip->i_diflags = 0; if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = 0; times |= XFS_ICHGTIME_CREATE; } xfs_trans_ichgtime(tp, ip, times); flags = XFS_ILOG_CORE; switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK: ip->i_df.if_format = XFS_DINODE_FMT_DEV; flags |= XFS_ILOG_DEV; break; case S_IFREG: case S_IFDIR: if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) xfs_inode_inherit_flags(ip, pip); if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) xfs_inode_inherit_flags2(ip, pip); fallthrough; case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_data = NULL; break; default: ASSERT(0); } if (xfs_icreate_want_attrfork(mp, args)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); if (!xfs_has_attr(mp)) { spin_lock(&mp->m_sb_lock); xfs_add_attr(mp); spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); } } xfs_trans_log_inode(tp, ip, flags); } /* * In-Core Unlinked List Lookups * ============================= * * Every inode is supposed to be reachable from some other piece of metadata * with the exception of the root directory. Inodes with a connection to a * file descriptor but not linked from anywhere in the on-disk directory tree * are collectively known as unlinked inodes, though the filesystem itself * maintains links to these inodes so that on-disk metadata are consistent. * * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI * header contains a number of buckets that point to an inode, and each inode * record has a pointer to the next inode in the hash chain. This * singly-linked list causes scaling problems in the iunlink remove function * because we must walk that list to find the inode that points to the inode * being removed from the unlinked hash bucket list. * * Hence we keep an in-memory double linked list to link each inode on an * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer * based lists would require having 64 list heads in the perag, one for each * list. This is expensive in terms of memory (think millions of AGs) and cache * misses on lookups. Instead, use the fact that inodes on the unlinked list * must be referenced at the VFS level to keep them on the list and hence we * have an existence guarantee for inodes on the unlinked list. * * Given we have an existence guarantee, we can use lockless inode cache lookups * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode * for the double linked unlinked list, and we don't need any extra locking to * keep the list safe as all manipulations are done under the AGI buffer lock. * Keeping the list up to date does not require memory allocation, just finding * the XFS inode and updating the next/prev unlinked list aginos. */ /* * Update the prev pointer of the next agino. Returns -ENOLINK if the inode * is not in cache. */ static int xfs_iunlink_update_backref( struct xfs_perag *pag, xfs_agino_t prev_agino, xfs_agino_t next_agino) { struct xfs_inode *ip; /* No update necessary if we are at the end of the list. */ if (next_agino == NULLAGINO) return 0; ip = xfs_iunlink_lookup(pag, next_agino); if (!ip) return -ENOLINK; ip->i_prev_unlinked = prev_agino; return 0; } /* * Point the AGI unlinked bucket at an inode and log the results. The caller * is responsible for validating the old value. */ STATIC int xfs_iunlink_update_bucket( struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf *agibp, unsigned int bucket_index, xfs_agino_t new_agino) { struct xfs_agi *agi = agibp->b_addr; xfs_agino_t old_value; int offset; ASSERT(xfs_verify_agino_or_null(pag, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value, new_agino); /* * We should never find the head of the list already set to the value * passed in because either we're adding or removing ourselves from the * head of the list. */ if (old_value == new_agino) { xfs_buf_mark_corrupt(agibp); xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); offset = offsetof(struct xfs_agi, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); return 0; } static int xfs_iunlink_insert_inode( struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agibp->b_addr; xfs_agino_t next_agino; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the pointer isn't garbage and that this inode * isn't already on the list. */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino)) { xfs_buf_mark_corrupt(agibp); xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } /* * Update the prev pointer in the next inode to point back to this * inode. */ error = xfs_iunlink_update_backref(pag, agino, next_agino); if (error == -ENOLINK) error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); if (error) return error; if (next_agino != NULLAGINO) { /* * There is already another inode in the bucket, so point this * inode to the current head of the list. */ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); if (error) return error; ip->i_next_unlinked = next_agino; } /* Point the head of the list to point to this inode. */ ip->i_prev_unlinked = NULLAGINO; return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } /* * This is called when the inode's link count has gone to 0 or we are creating * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ int xfs_iunlink( struct xfs_trans *tp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag; struct xfs_buf *agibp; int error; ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); trace_xfs_iunlink(ip); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out; error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); out: xfs_perag_put(pag); return error; } static int xfs_iunlink_remove_inode( struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agibp->b_addr; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t head_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; trace_xfs_iunlink_remove(ip); /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the head pointer isn't garbage. */ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (!xfs_verify_agino(pag, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } /* * Set our inode's next_unlinked pointer to NULL and then return * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); if (error) return error; /* * Update the prev pointer in the next inode to point back to previous * inode in the chain. */ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, ip->i_next_unlinked); if (error == -ENOLINK) error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, ip->i_next_unlinked); if (error) return error; if (head_agino != agino) { struct xfs_inode *prev_ip; prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); if (!prev_ip) { xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } error = xfs_iunlink_log_inode(tp, prev_ip, pag, ip->i_next_unlinked); prev_ip->i_next_unlinked = ip->i_next_unlinked; } else { /* Point the head of the list to the next unlinked inode. */ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, ip->i_next_unlinked); } ip->i_next_unlinked = NULLAGINO; ip->i_prev_unlinked = 0; return error; } /* * Pull the on-disk inode from the AGI unlinked list. */ int xfs_iunlink_remove( struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *ip) { struct xfs_buf *agibp; int error; trace_xfs_iunlink_remove(ip); /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(pag, tp, 0, &agibp); if (error) return error; return xfs_iunlink_remove_inode(tp, pag, agibp, ip); } /* * Decrement the link count on an inode & log the change. If this causes the * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ int xfs_droplink( struct xfs_trans *tp, struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); if (inode->i_nlink == 0) { xfs_info_ratelimited(tp->t_mountp, "Inode 0x%llx link count dropped below zero. Pinning link count.", ip->i_ino); set_nlink(inode, XFS_NLINK_PINNED); } if (inode->i_nlink != XFS_NLINK_PINNED) drop_nlink(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (inode->i_nlink) return 0; return xfs_iunlink(tp, ip); } /* * Increment the link count on an inode & log the change. */ void xfs_bumplink( struct xfs_trans *tp, struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); if (inode->i_nlink == XFS_NLINK_PINNED - 1) xfs_info_ratelimited(tp->t_mountp, "Inode 0x%llx link count exceeded maximum. Pinning link count.", ip->i_ino); if (inode->i_nlink != XFS_NLINK_PINNED) inc_nlink(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } /* Free an inode in the ondisk index and zero it out. */ int xfs_inode_uninit( struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *ip, struct xfs_icluster *xic) { struct xfs_mount *mp = ip->i_mount; int error; /* * Free the inode first so that we guarantee that the AGI lock is going * to be taken before we remove the inode from the unlinked list. This * makes the AGI lock -> unlinked list modification order the same as * used in O_TMPFILE creation. */ error = xfs_difree(tp, pag, ip->i_ino, xic); if (error) return error; error = xfs_iunlink_remove(tp, pag, ip); if (error) return error; /* * Free any local-format data sitting around before we reset the * data fork to extents format. Note that the attr fork data has * already been freed by xfs_attr_inactive. */ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { kfree(ip->i_df.if_data); ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_diflags = 0; ip->i_diflags2 = mp->m_ino_geo.new_diflags2; ip->i_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* * Bump the generation count so no one will be confused * by reincarnations of this inode. */ VFS_I(ip)->i_generation++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; } |
| 78 72 79 226 2 89 152 31 31 95 29 72 37 37 31 31 4 4 32 3 106 94 21 21 2 168 168 35 35 34 67 33 32 162 225 158 158 229 159 86 84 167 63 7 307 2 10 7 95 19 101 45 2 5 13 77 70 12 86 17 6 18 6 4 12 6 34 33 97 97 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM f2fs #if !defined(_TRACE_F2FS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_F2FS_H #include <linux/tracepoint.h> #include <uapi/linux/f2fs.h> #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); TRACE_DEFINE_ENUM(WARM); TRACE_DEFINE_ENUM(COLD); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); TRACE_DEFINE_ENUM(CURSEG_WARM_DATA); TRACE_DEFINE_ENUM(CURSEG_COLD_DATA); TRACE_DEFINE_ENUM(CURSEG_HOT_NODE); TRACE_DEFINE_ENUM(CURSEG_WARM_NODE); TRACE_DEFINE_ENUM(CURSEG_COLD_NODE); TRACE_DEFINE_ENUM(NO_CHECK_TYPE); TRACE_DEFINE_ENUM(GC_GREEDY); TRACE_DEFINE_ENUM(GC_CB); TRACE_DEFINE_ENUM(FG_GC); TRACE_DEFINE_ENUM(BG_GC); TRACE_DEFINE_ENUM(LFS); TRACE_DEFINE_ENUM(SSR); TRACE_DEFINE_ENUM(__REQ_RAHEAD); TRACE_DEFINE_ENUM(__REQ_SYNC); TRACE_DEFINE_ENUM(__REQ_IDLE); TRACE_DEFINE_ENUM(__REQ_PREFLUSH); TRACE_DEFINE_ENUM(__REQ_FUA); TRACE_DEFINE_ENUM(__REQ_PRIO); TRACE_DEFINE_ENUM(__REQ_META); TRACE_DEFINE_ENUM(CP_UMOUNT); TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define show_block_type(type) \ __print_symbolic(type, \ { NODE, "NODE" }, \ { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) #define show_block_temp(temp) \ __print_symbolic(temp, \ { HOT, "HOT" }, \ { WARM, "WARM" }, \ { COLD, "COLD" }) #define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO | \ REQ_PREFLUSH | REQ_FUA) #define F2FS_BIO_FLAG_MASK(t) (__force u32)((t) & F2FS_OP_FLAGS) #define show_bio_type(op,op_flags) show_bio_op(op), \ show_bio_op_flags(op_flags) #define show_bio_op(op) blk_op_str(op) #define show_bio_op_flags(flags) \ __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \ { (__force u32)REQ_RAHEAD, "R" }, \ { (__force u32)REQ_SYNC, "S" }, \ { (__force u32)REQ_META, "M" }, \ { (__force u32)REQ_PRIO, "P" }, \ { (__force u32)REQ_PREFLUSH, "PF" }, \ { (__force u32)REQ_FUA, "FUA" }) #define show_data_type(type) \ __print_symbolic(type, \ { CURSEG_HOT_DATA, "Hot DATA" }, \ { CURSEG_WARM_DATA, "Warm DATA" }, \ { CURSEG_COLD_DATA, "Cold DATA" }, \ { CURSEG_HOT_NODE, "Hot NODE" }, \ { CURSEG_WARM_NODE, "Warm NODE" }, \ { CURSEG_COLD_NODE, "Cold NODE" }, \ { NO_CHECK_TYPE, "No TYPE" }) #define show_file_type(type) \ __print_symbolic(type, \ { 0, "FILE" }, \ { 1, "DIR" }) #define show_gc_type(type) \ __print_symbolic(type, \ { FG_GC, "Foreground GC" }, \ { BG_GC, "Background GC" }) #define show_alloc_mode(type) \ __print_symbolic(type, \ { LFS, "LFS-mode" }, \ { SSR, "SSR-mode" }, \ { AT_SSR, "AT_SSR-mode" }) #define show_victim_policy(type) \ __print_symbolic(type, \ { GC_GREEDY, "Greedy" }, \ { GC_CB, "Cost-Benefit" }, \ { GC_AT, "Age-threshold" }) #define show_cpreason(type) \ __print_flags(type, "|", \ { CP_UMOUNT, "Umount" }, \ { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ { CP_DISCARD, "Discard" }, \ { CP_PAUSE, "Pause" }, \ { CP_TRIMMED, "Trimmed" }, \ { CP_RESIZE, "Resize" }) #define show_fsync_cpreason(type) \ __print_symbolic(type, \ { CP_NO_NEEDED, "no needed" }, \ { CP_NON_REGULAR, "non regular" }, \ { CP_COMPRESSED, "compressed" }, \ { CP_HARDLINK, "hardlink" }, \ { CP_SB_NEED_CP, "sb needs cp" }, \ { CP_WRONG_PINO, "wrong pino" }, \ { CP_NO_SPC_ROLL, "no space roll forward" }, \ { CP_NODE_NEED_CP, "node needs cp" }, \ { CP_FASTBOOT_MODE, "fastboot mode" }, \ { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }, \ { CP_XATTR_DIR, "dir's xattr updated" }) #define show_shutdown_mode(type) \ __print_symbolic(type, \ { F2FS_GOING_DOWN_FULLSYNC, "full sync" }, \ { F2FS_GOING_DOWN_METASYNC, "meta sync" }, \ { F2FS_GOING_DOWN_NOSYNC, "no sync" }, \ { F2FS_GOING_DOWN_METAFLUSH, "meta flush" }, \ { F2FS_GOING_DOWN_NEED_FSCK, "need fsck" }) #define show_compress_algorithm(type) \ __print_symbolic(type, \ { COMPRESS_LZO, "LZO" }, \ { COMPRESS_LZ4, "LZ4" }, \ { COMPRESS_ZSTD, "ZSTD" }, \ { COMPRESS_LZORLE, "LZO-RLE" }) #define show_extent_type(type) \ __print_symbolic(type, \ { EX_READ, "Read" }, \ { EX_BLOCK_AGE, "Block Age" }) #define show_inode_type(x) \ __print_symbolic(x, \ { S_IFLNK, "symbolic" }, \ { S_IFREG, "regular" }, \ { S_IFDIR, "directory" }, \ { S_IFCHR, "character" }, \ { S_IFBLK, "block" }, \ { S_IFIFO, "fifo" }, \ { S_IFSOCK, "sock" }) #define S_ALL_PERM (S_ISUID | S_ISGID | S_ISVTX | \ S_IRWXU | S_IRWXG | S_IRWXO) struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; struct victim_sel_policy; struct f2fs_map_blocks; DECLARE_EVENT_CLASS(f2fs__inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(ino_t, pino) __field(umode_t, mode) __field(loff_t, size) __field(unsigned int, nlink) __field(blkcnt_t, blocks) __field(__u8, advise) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pino = F2FS_I(inode)->i_pino; __entry->mode = inode->i_mode; __entry->nlink = inode->i_nlink; __entry->size = inode->i_size; __entry->blocks = inode->i_blocks; __entry->advise = F2FS_I(inode)->i_advise; ), TP_printk("dev = (%d,%d), ino = %lu, pino = %lu, i_mode = 0x%hx, " "i_size = %lld, i_nlink = %u, i_blocks = %llu, i_advise = 0x%x", show_dev_ino(__entry), (unsigned long)__entry->pino, __entry->mode, __entry->size, (unsigned int)__entry->nlink, (unsigned long long)__entry->blocks, (unsigned char)__entry->advise) ); DECLARE_EVENT_CLASS(f2fs__inode_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(umode_t, mode) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, type: %s, mode = 0%o, ret = %d", show_dev_ino(__entry), show_inode_type(__entry->mode & S_IFMT), __entry->mode & S_ALL_PERM, __entry->ret) ); DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); TRACE_EVENT(f2fs_sync_file_exit, TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret), TP_ARGS(inode, cp_reason, datasync, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, cp_reason) __field(int, datasync) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->cp_reason = cp_reason; __entry->datasync = datasync; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, " "datasync = %d, ret = %d", show_dev_ino(__entry), show_fsync_cpreason(__entry->cp_reason), __entry->datasync, __entry->ret) ); TRACE_EVENT(f2fs_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field(dev_t, dev) __field(int, dirty) __field(int, wait) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->dirty = is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY); __entry->wait = wait; ), TP_printk("dev = (%d,%d), superblock is %s, wait = %d", show_dev(__entry->dev), __entry->dirty ? "dirty" : "not dirty", __entry->wait) ); DEFINE_EVENT(f2fs__inode, f2fs_iget, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_iget_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__inode, f2fs_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_new_inode, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); TRACE_EVENT(f2fs_unlink_enter, TP_PROTO(struct inode *dir, struct dentry *dentry), TP_ARGS(dir, dentry), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) __string(name, dentry->d_name.name) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __entry->size = dir->i_size; __entry->blocks = dir->i_blocks; __assign_str(name); ), TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " "i_blocks = %llu, name = %s", show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, __get_str(name)) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); TRACE_EVENT(f2fs_truncate_data_blocks_range, TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs, int free), TP_ARGS(inode, nid, ofs, free), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(nid_t, nid) __field(unsigned int, ofs) __field(int, free) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nid = nid; __entry->ofs = ofs; __entry->free = free; ), TP_printk("dev = (%d,%d), ino = %lu, nid = %u, offset = %u, freed = %d", show_dev_ino(__entry), (unsigned int)__entry->nid, __entry->ofs, __entry->free) ); DECLARE_EVENT_CLASS(f2fs__truncate_op, TP_PROTO(struct inode *inode, u64 from), TP_ARGS(inode, from), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) __field(u64, from) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = inode->i_size; __entry->blocks = inode->i_blocks; __entry->from = from; ), TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld, i_blocks = %llu, " "start file offset = %llu", show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, (unsigned long long)__entry->from) ); DEFINE_EVENT(f2fs__truncate_op, f2fs_truncate_blocks_enter, TP_PROTO(struct inode *inode, u64 from), TP_ARGS(inode, from) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_blocks_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__truncate_op, f2fs_truncate_inode_blocks_enter, TP_PROTO(struct inode *inode, u64 from), TP_ARGS(inode, from) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_inode_blocks_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DECLARE_EVENT_CLASS(f2fs__truncate_node, TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr), TP_ARGS(inode, nid, blk_addr), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(nid_t, nid) __field(block_t, blk_addr) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nid = nid; __entry->blk_addr = blk_addr; ), TP_printk("dev = (%d,%d), ino = %lu, nid = %u, block_address = 0x%llx", show_dev_ino(__entry), (unsigned int)__entry->nid, (unsigned long long)__entry->blk_addr) ); DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_nodes_enter, TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr), TP_ARGS(inode, nid, blk_addr) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_nodes_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node, TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr), TP_ARGS(inode, nid, blk_addr) ); TRACE_EVENT(f2fs_truncate_partial_nodes, TP_PROTO(struct inode *inode, nid_t *nid, int depth, int err), TP_ARGS(inode, nid, depth, err), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __array(nid_t, nid, 3) __field(int, depth) __field(int, err) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nid[0] = nid[0]; __entry->nid[1] = nid[1]; __entry->nid[2] = nid[2]; __entry->depth = depth; __entry->err = err; ), TP_printk("dev = (%d,%d), ino = %lu, " "nid[0] = %u, nid[1] = %u, nid[2] = %u, depth = %d, err = %d", show_dev_ino(__entry), (unsigned int)__entry->nid[0], (unsigned int)__entry->nid[1], (unsigned int)__entry->nid[2], __entry->depth, __entry->err) ); TRACE_EVENT(f2fs_file_write_iter, TP_PROTO(struct inode *inode, loff_t offset, size_t length, ssize_t ret), TP_ARGS(inode, offset, length, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(size_t, length) __field(ssize_t, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->length = length; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, " "offset = %lld, length = %zu, written(err) = %zd", show_dev_ino(__entry), __entry->offset, __entry->length, __entry->ret) ); TRACE_EVENT(f2fs_map_blocks, TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int flag, int ret), TP_ARGS(inode, map, flag, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(block_t, m_lblk) __field(block_t, m_pblk) __field(unsigned int, m_len) __field(unsigned int, m_flags) __field(int, m_seg_type) __field(bool, m_may_create) __field(bool, m_multidev_dio) __field(int, flag) __field(int, ret) ), TP_fast_assign( __entry->dev = map->m_bdev->bd_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; __entry->m_len = map->m_len; __entry->m_flags = map->m_flags; __entry->m_seg_type = map->m_seg_type; __entry->m_may_create = map->m_may_create; __entry->m_multidev_dio = map->m_multidev_dio; __entry->flag = flag; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " "seg_type = %d, may_create = %d, multidevice = %d, " "flag = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, (unsigned long long)__entry->m_len, __entry->m_flags, __entry->m_seg_type, __entry->m_may_create, __entry->m_multidev_dio, __entry->flag, __entry->ret) ); TRACE_EVENT(f2fs_background_gc, TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait_ms = wait_ms; __entry->prefree = prefree; __entry->free = free; ), TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); TRACE_EVENT(f2fs_gc_begin, TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, gc_type) __field(bool, no_bg_gc) __field(unsigned int, nr_free_secs) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) __field(unsigned int, free_sec) __field(unsigned int, free_seg) __field(int, reserved_seg) __field(unsigned int, prefree_seg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->gc_type = gc_type; __entry->no_bg_gc = no_bg_gc; __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; __entry->free_sec = free_sec; __entry->free_seg = free_seg; __entry->reserved_seg = reserved_seg; __entry->prefree_seg = prefree_seg; ), TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), show_gc_type(__entry->gc_type), (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, __entry->free_sec, __entry->free_seg, __entry->reserved_seg, __entry->prefree_seg) ); TRACE_EVENT(f2fs_gc_end, TP_PROTO(struct super_block *sb, int ret, int seg_freed, int sec_freed, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, ret) __field(int, seg_freed) __field(int, sec_freed) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) __field(unsigned int, free_sec) __field(unsigned int, free_seg) __field(int, reserved_seg) __field(unsigned int, prefree_seg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ret = ret; __entry->seg_freed = seg_freed; __entry->sec_freed = sec_freed; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; __entry->free_sec = free_sec; __entry->free_seg = free_seg; __entry->reserved_seg = reserved_seg; __entry->prefree_seg = prefree_seg; ), TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " "free_seg:%u, rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), __entry->ret, __entry->seg_freed, __entry->sec_freed, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, __entry->free_sec, __entry->free_seg, __entry->reserved_seg, __entry->prefree_seg) ); TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, struct victim_sel_policy *p, unsigned int pre_victim, unsigned int prefree, unsigned int free), TP_ARGS(sb, type, gc_type, p, pre_victim, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) __field(int, gc_type) __field(int, alloc_mode) __field(int, gc_mode) __field(unsigned int, victim) __field(unsigned int, cost) __field(unsigned int, ofs_unit) __field(unsigned int, pre_victim) __field(unsigned int, prefree) __field(unsigned int, free) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->type = type; __entry->gc_type = gc_type; __entry->alloc_mode = p->alloc_mode; __entry->gc_mode = p->gc_mode; __entry->victim = p->min_segno; __entry->cost = p->min_cost; __entry->ofs_unit = p->ofs_unit; __entry->pre_victim = pre_victim; __entry->prefree = prefree; __entry->free = free; ), TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), " "victim = %u, cost = %u, ofs_unit = %u, " "pre_victim_secno = %d, prefree = %u, free = %u", show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), show_victim_policy(__entry->gc_mode), __entry->victim, __entry->cost, __entry->ofs_unit, (int)__entry->pre_victim, __entry->prefree, __entry->free) ); TRACE_EVENT(f2fs_lookup_start, TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __string(name, dentry->d_name.name) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __assign_str(name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", show_dev_ino(__entry), __get_str(name), __entry->flags) ); TRACE_EVENT(f2fs_lookup_end, TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino, int err), TP_ARGS(dir, dentry, ino, err), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __string(name, dentry->d_name.name) __field(nid_t, cino) __field(int, err) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __assign_str(name); __entry->cino = ino; __entry->err = err; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", show_dev_ino(__entry), __get_str(name), __entry->cino, __entry->err) ); TRACE_EVENT(f2fs_rename_start, TP_PROTO(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags), TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __string(old_name, old_dentry->d_name.name) __field(ino_t, new_pino) __string(new_name, new_dentry->d_name.name) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = old_dir->i_sb->s_dev; __entry->ino = old_dir->i_ino; __assign_str(old_name); __entry->new_pino = new_dir->i_ino; __assign_str(new_name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), old_dir = %lu, old_name: %s, " "new_dir = %lu, new_name: %s, flags = %u", show_dev_ino(__entry), __get_str(old_name), __entry->new_pino, __get_str(new_name), __entry->flags) ); TRACE_EVENT(f2fs_rename_end, TP_PROTO(struct dentry *old_dentry, struct dentry *new_dentry, unsigned int flags, int ret), TP_ARGS(old_dentry, new_dentry, flags, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __string(old_name, old_dentry->d_name.name) __string(new_name, new_dentry->d_name.name) __field(unsigned int, flags) __field(int, ret) ), TP_fast_assign( __entry->dev = old_dentry->d_sb->s_dev; __entry->ino = old_dentry->d_inode->i_ino; __assign_str(old_name); __assign_str(new_name); __entry->flags = flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, old_name: %s, " "new_name: %s, flags = %u, ret = %d", show_dev_ino(__entry), __get_str(old_name), __get_str(new_name), __entry->flags, __entry->ret) ); TRACE_EVENT(f2fs_readdir, TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err), TP_ARGS(dir, start_pos, end_pos, err), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, start) __field(loff_t, end) __field(int, err) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __entry->start = start_pos; __entry->end = end_pos; __entry->err = err; ), TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d", show_dev_ino(__entry), __entry->start, __entry->end, __entry->err) ); TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, loff_t offset, loff_t len, int ret), TP_ARGS(inode, mode, offset, len, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, mode) __field(loff_t, offset) __field(loff_t, len) __field(loff_t, size) __field(blkcnt_t, blocks) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = mode; __entry->offset = offset; __entry->len = len; __entry->size = inode->i_size; __entry->blocks = inode->i_blocks; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, mode = %x, offset = %lld, " "len = %lld, i_size = %lld, i_blocks = %llu, ret = %d", show_dev_ino(__entry), __entry->mode, (unsigned long long)__entry->offset, (unsigned long long)__entry->len, (unsigned long long)__entry->size, (unsigned long long)__entry->blocks, __entry->ret) ); TRACE_EVENT(f2fs_direct_IO_enter, TP_PROTO(struct inode *inode, struct kiocb *iocb, long len, int rw), TP_ARGS(inode, iocb, len, rw), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, ki_pos) __field(int, ki_flags) __field(u16, ki_ioprio) __field(unsigned long, len) __field(int, rw) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ki_pos = iocb->ki_pos; __entry->ki_flags = iocb->ki_flags; __entry->ki_ioprio = iocb->ki_ioprio; __entry->len = len; __entry->rw = rw; ), TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), __entry->ki_pos, __entry->len, __entry->ki_flags, __entry->ki_ioprio, __entry->rw) ); TRACE_EVENT(f2fs_direct_IO_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret), TP_ARGS(inode, offset, len, rw, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned long, len) __field(int, rw) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->rw = rw; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu " "rw = %d ret = %d", show_dev_ino(__entry), __entry->pos, __entry->len, __entry->rw, __entry->ret) ); TRACE_EVENT(f2fs_reserve_new_blocks, TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, blkcnt_t count), TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; __entry->count = count; ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_folio_bio, TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), TP_ARGS(folio, fio), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) __field(enum req_op, op) __field(blk_opf_t, op_flags) __field(int, temp) __field(int, type) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->ino = folio->mapping->host->i_ino; __entry->index = folio->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, folio_index = 0x%lx, " "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), show_block_temp(__entry->temp), show_block_type(__entry->type)) ); DEFINE_EVENT_CONDITION(f2fs__submit_folio_bio, f2fs_submit_folio_bio, TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), TP_ARGS(folio, fio), TP_CONDITION(folio->mapping) ); DEFINE_EVENT_CONDITION(f2fs__submit_folio_bio, f2fs_submit_folio_write, TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), TP_ARGS(folio, fio), TP_CONDITION(folio->mapping) ); DECLARE_EVENT_CLASS(f2fs__bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, target) __field(enum req_op, op) __field(blk_opf_t, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->target = bio_dev(bio); __entry->op = bio_op(bio); __entry->op_flags = bio->bi_opf; __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u", show_dev(__entry->target), show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); TRACE_EVENT(f2fs_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned int, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; ), TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u", show_dev_ino(__entry), (unsigned long long)__entry->pos, __entry->len) ); TRACE_EVENT(f2fs_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned int, len) __field(unsigned int, copied) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u, copied = %u", show_dev_ino(__entry), (unsigned long long)__entry->pos, __entry->len, __entry->copied) ); DECLARE_EVENT_CLASS(f2fs__folio, TP_PROTO(struct folio *folio, int type), TP_ARGS(folio, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, type) __field(int, dir) __field(pgoff_t, index) __field(int, dirty) __field(int, uptodate) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->ino = folio->mapping->host->i_ino; __entry->type = type; __entry->dir = S_ISDIR(folio->mapping->host->i_mode); __entry->index = folio->index; __entry->dirty = folio_test_dirty(folio); __entry->uptodate = folio_test_uptodate(folio); ), TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, " "dirty = %d, uptodate = %d", show_dev_ino(__entry), show_block_type(__entry->type), show_file_type(__entry->dir), (unsigned long)__entry->index, __entry->dirty, __entry->uptodate) ); DEFINE_EVENT(f2fs__folio, f2fs_writepage, TP_PROTO(struct folio *folio, int type), TP_ARGS(folio, type) ); DEFINE_EVENT(f2fs__folio, f2fs_do_write_data_page, TP_PROTO(struct folio *folio, int type), TP_ARGS(folio, type) ); DEFINE_EVENT(f2fs__folio, f2fs_readpage, TP_PROTO(struct folio *folio, int type), TP_ARGS(folio, type) ); DEFINE_EVENT(f2fs__folio, f2fs_set_page_dirty, TP_PROTO(struct folio *folio, int type), TP_ARGS(folio, type) ); TRACE_EVENT(f2fs_replace_atomic_write_block, TP_PROTO(struct inode *inode, struct inode *cow_inode, pgoff_t index, block_t old_addr, block_t new_addr, bool recovery), TP_ARGS(inode, cow_inode, index, old_addr, new_addr, recovery), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(ino_t, cow_ino) __field(pgoff_t, index) __field(block_t, old_addr) __field(block_t, new_addr) __field(bool, recovery) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->cow_ino = cow_inode->i_ino; __entry->index = index; __entry->old_addr = old_addr; __entry->new_addr = new_addr; __entry->recovery = recovery; ), TP_printk("dev = (%d,%d), ino = %lu, cow_ino = %lu, index = %lu, " "old_addr = 0x%llx, new_addr = 0x%llx, recovery = %d", show_dev_ino(__entry), __entry->cow_ino, (unsigned long)__entry->index, (unsigned long long)__entry->old_addr, (unsigned long long)__entry->new_addr, __entry->recovery) ); DECLARE_EVENT_CLASS(f2fs_mmap, TP_PROTO(struct inode *inode, pgoff_t index, vm_flags_t flags, vm_fault_t ret), TP_ARGS(inode, index, flags, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) __field(vm_flags_t, flags) __field(vm_fault_t, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = index; __entry->flags = flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, index = %lu, flags: %s, ret: %s", show_dev_ino(__entry), (unsigned long)__entry->index, __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), __print_flags(__entry->ret, "|", VM_FAULT_RESULT_TRACE)) ); DEFINE_EVENT(f2fs_mmap, f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, vm_flags_t flags, vm_fault_t ret), TP_ARGS(inode, index, flags, ret) ); DEFINE_EVENT(f2fs_mmap, f2fs_vm_page_mkwrite, TP_PROTO(struct inode *inode, pgoff_t index, vm_flags_t flags, vm_fault_t ret), TP_ARGS(inode, index, flags, ret) ); TRACE_EVENT(f2fs_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int type), TP_ARGS(inode, wbc, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, type) __field(int, dir) __field(long, nr_to_write) __field(long, pages_skipped) __field(loff_t, range_start) __field(loff_t, range_end) __field(pgoff_t, writeback_index) __field(int, sync_mode) __field(char, for_kupdate) __field(char, for_background) __field(char, tagged_writepages) __field(char, range_cyclic) __field(char, for_sync) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->type = type; __entry->dir = S_ISDIR(inode->i_mode); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->tagged_writepages = wbc->tagged_writepages; __entry->range_cyclic = wbc->range_cyclic; __entry->for_sync = wbc->for_sync; ), TP_printk("dev = (%d,%d), ino = %lu, %s, %s, nr_to_write %ld, " "skipped %ld, start %lld, end %lld, wb_idx %lu, sync_mode %d, " "kupdate %u background %u tagged %u cyclic %u sync %u", show_dev_ino(__entry), show_block_type(__entry->type), show_file_type(__entry->dir), __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, (unsigned long)__entry->writeback_index, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->tagged_writepages, __entry->range_cyclic, __entry->for_sync) ); TRACE_EVENT(f2fs_readpages, TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage), TP_ARGS(inode, start, nrpage), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, start) __field(unsigned int, nrpage) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->nrpage = nrpage; ), TP_printk("dev = (%d,%d), ino = %lu, start = %lu nrpage = %u", show_dev_ino(__entry), (unsigned long)__entry->start, __entry->nrpage) ); TRACE_EVENT(f2fs_write_checkpoint, TP_PROTO(struct super_block *sb, int reason, const char *msg), TP_ARGS(sb, reason, msg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) __string(dest_msg, msg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; __assign_str(dest_msg); ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), __get_str(dest_msg)) ); DECLARE_EVENT_CLASS(f2fs_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) __field(block_t, blkstart) __field(block_t, blklen) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen) ); DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen) ); DEFINE_EVENT(f2fs_discard, f2fs_remove_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen) ); DECLARE_EVENT_CLASS(f2fs_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) __field(block_t, blkstart) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), zone at block = 0x%llx", show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); DEFINE_EVENT(f2fs_reset_zone, f2fs_queue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), TP_ARGS(dev, blkstart) ); DEFINE_EVENT(f2fs_reset_zone, f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), TP_ARGS(dev, blkstart) ); TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct block_device *dev, unsigned int nobarrier, unsigned int flush_merge, int ret), TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) __field(int, ret) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; __entry->ret = ret; ), TP_printk("dev = (%d,%d), %s %s, ret = %d", show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", __entry->flush_merge ? " with flush_merge" : "", __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, TP_PROTO(struct inode *inode, unsigned int pgofs, enum extent_type type), TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->type = type; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), __entry->pgofs, show_extent_type(__entry->type)) ); TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), TP_ARGS(inode, pgofs, ei), TP_CONDITION(ei), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) __field(unsigned int, len) __field(u32, blk) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; __entry->len = ei->len; __entry->blk = ei->blk; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, __entry->len, __entry->blk) ); TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), TP_ARGS(inode, pgofs, ei), TP_CONDITION(ei), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) __field(unsigned int, len) __field(unsigned long long, age) __field(unsigned long long, blocks) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; __entry->len = ei->len; __entry->age = ei->age; __entry->blocks = ei->last_blocks; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, __entry->len, __entry->age, __entry->blocks) ); TRACE_EVENT(f2fs_update_read_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, block_t blkaddr, unsigned int c_len), TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(u32, blk) __field(unsigned int, len) __field(unsigned int, c_len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->len = len; __entry->blk = blkaddr; __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, __entry->len, __entry->blk, __entry->c_len) ); TRACE_EVENT(f2fs_update_age_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, unsigned long long age, unsigned long long last_blks), TP_ARGS(inode, pgofs, len, age, last_blks), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, len) __field(unsigned long long, age) __field(unsigned long long, blocks) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->len = len; __entry->age = age; __entry->blocks = last_blks; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "len = %u, age = %llu, blocks = %llu", show_dev_ino(__entry), __entry->pgofs, __entry->len, __entry->age, __entry->blocks) ); TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, unsigned int tree_cnt, enum extent_type type), TP_ARGS(sbi, node_cnt, tree_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, node_cnt) __field(unsigned int, tree_cnt) __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->node_cnt = node_cnt; __entry->tree_cnt = tree_cnt; __entry->type = type; ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u, type = %s", show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt, show_extent_type(__entry->type)) ); TRACE_EVENT(f2fs_destroy_extent_tree, TP_PROTO(struct inode *inode, unsigned int node_cnt, enum extent_type type), TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, node_cnt) __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->node_cnt = node_cnt; __entry->type = type; ), TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), __entry->node_cnt, show_extent_type(__entry->type)) ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count), TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) __field(s64, count) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->type = type; __entry->count = count; ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count) ); DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count) ); TRACE_EVENT(f2fs_shutdown, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int mode, int ret), TP_ARGS(sbi, mode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, mode) __field(int, ret) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->mode = mode; __entry->ret = ret; ), TP_printk("dev = (%d,%d), mode: %s, ret:%d", show_dev(__entry->dev), show_shutdown_mode(__entry->mode), __entry->ret) ); DECLARE_EVENT_CLASS(f2fs_zip_start, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int cluster_size, unsigned char algtype), TP_ARGS(inode, cluster_idx, cluster_size, algtype), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, idx) __field(unsigned int, size) __field(unsigned int, algtype) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->idx = cluster_idx; __entry->size = cluster_size; __entry->algtype = algtype; ), TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%lu, " "cluster_size = %u, algorithm = %s", show_dev_ino(__entry), __entry->idx, __entry->size, show_compress_algorithm(__entry->algtype)) ); DECLARE_EVENT_CLASS(f2fs_zip_end, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int compressed_size, int ret), TP_ARGS(inode, cluster_idx, compressed_size, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, idx) __field(unsigned int, size) __field(unsigned int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->idx = cluster_idx; __entry->size = compressed_size; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%lu, " "compressed_size = %u, ret = %d", show_dev_ino(__entry), __entry->idx, __entry->size, __entry->ret) ); DEFINE_EVENT(f2fs_zip_start, f2fs_compress_pages_start, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int cluster_size, unsigned char algtype), TP_ARGS(inode, cluster_idx, cluster_size, algtype) ); DEFINE_EVENT(f2fs_zip_start, f2fs_decompress_pages_start, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int cluster_size, unsigned char algtype), TP_ARGS(inode, cluster_idx, cluster_size, algtype) ); DEFINE_EVENT(f2fs_zip_end, f2fs_compress_pages_end, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int compressed_size, int ret), TP_ARGS(inode, cluster_idx, compressed_size, ret) ); DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int compressed_size, int ret), TP_ARGS(inode, cluster_idx, compressed_size, ret) ); #ifdef CONFIG_F2FS_IOSTAT TRACE_EVENT(f2fs_iostat, TP_PROTO(struct f2fs_sb_info *sbi, unsigned long long *iostat), TP_ARGS(sbi, iostat), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, app_dio) __field(unsigned long long, app_bio) __field(unsigned long long, app_wio) __field(unsigned long long, app_mio) __field(unsigned long long, app_bcdio) __field(unsigned long long, app_mcdio) __field(unsigned long long, fs_dio) __field(unsigned long long, fs_cdio) __field(unsigned long long, fs_nio) __field(unsigned long long, fs_mio) __field(unsigned long long, fs_gc_dio) __field(unsigned long long, fs_gc_nio) __field(unsigned long long, fs_cp_dio) __field(unsigned long long, fs_cp_nio) __field(unsigned long long, fs_cp_mio) __field(unsigned long long, app_drio) __field(unsigned long long, app_brio) __field(unsigned long long, app_rio) __field(unsigned long long, app_mrio) __field(unsigned long long, app_bcrio) __field(unsigned long long, app_mcrio) __field(unsigned long long, fs_drio) __field(unsigned long long, fs_gdrio) __field(unsigned long long, fs_cdrio) __field(unsigned long long, fs_nrio) __field(unsigned long long, fs_mrio) __field(unsigned long long, fs_discard) __field(unsigned long long, fs_reset_zone) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->app_dio = iostat[APP_DIRECT_IO]; __entry->app_bio = iostat[APP_BUFFERED_IO]; __entry->app_wio = iostat[APP_WRITE_IO]; __entry->app_mio = iostat[APP_MAPPED_IO]; __entry->app_bcdio = iostat[APP_BUFFERED_CDATA_IO]; __entry->app_mcdio = iostat[APP_MAPPED_CDATA_IO]; __entry->fs_dio = iostat[FS_DATA_IO]; __entry->fs_cdio = iostat[FS_CDATA_IO]; __entry->fs_nio = iostat[FS_NODE_IO]; __entry->fs_mio = iostat[FS_META_IO]; __entry->fs_gc_dio = iostat[FS_GC_DATA_IO]; __entry->fs_gc_nio = iostat[FS_GC_NODE_IO]; __entry->fs_cp_dio = iostat[FS_CP_DATA_IO]; __entry->fs_cp_nio = iostat[FS_CP_NODE_IO]; __entry->fs_cp_mio = iostat[FS_CP_META_IO]; __entry->app_drio = iostat[APP_DIRECT_READ_IO]; __entry->app_brio = iostat[APP_BUFFERED_READ_IO]; __entry->app_rio = iostat[APP_READ_IO]; __entry->app_mrio = iostat[APP_MAPPED_READ_IO]; __entry->app_bcrio = iostat[APP_BUFFERED_CDATA_READ_IO]; __entry->app_mcrio = iostat[APP_MAPPED_CDATA_READ_IO]; __entry->fs_drio = iostat[FS_DATA_READ_IO]; __entry->fs_gdrio = iostat[FS_GDATA_READ_IO]; __entry->fs_cdrio = iostat[FS_CDATA_READ_IO]; __entry->fs_nrio = iostat[FS_NODE_READ_IO]; __entry->fs_mrio = iostat[FS_META_READ_IO]; __entry->fs_discard = iostat[FS_DISCARD_IO]; __entry->fs_reset_zone = iostat[FS_ZONE_RESET_IO]; ), TP_printk("dev = (%d,%d), " "app [write=%llu (direct=%llu, buffered=%llu), mapped=%llu, " "compr(buffered=%llu, mapped=%llu)], " "fs [data=%llu, cdata=%llu, node=%llu, meta=%llu, discard=%llu, " "reset_zone=%llu], " "gc [data=%llu, node=%llu], " "cp [data=%llu, node=%llu, meta=%llu], " "app [read=%llu (direct=%llu, buffered=%llu), mapped=%llu], " "compr(buffered=%llu, mapped=%llu)], " "fs [data=%llu, (gc_data=%llu, cdata=%llu), " "node=%llu, meta=%llu]", show_dev(__entry->dev), __entry->app_wio, __entry->app_dio, __entry->app_bio, __entry->app_mio, __entry->app_bcdio, __entry->app_mcdio, __entry->fs_dio, __entry->fs_cdio, __entry->fs_nio, __entry->fs_mio, __entry->fs_discard, __entry->fs_reset_zone, __entry->fs_gc_dio, __entry->fs_gc_nio, __entry->fs_cp_dio, __entry->fs_cp_nio, __entry->fs_cp_mio, __entry->app_rio, __entry->app_drio, __entry->app_brio, __entry->app_mrio, __entry->app_bcrio, __entry->app_mcrio, __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); #ifndef __F2FS_IOSTAT_LATENCY_TYPE #define __F2FS_IOSTAT_LATENCY_TYPE struct f2fs_iostat_latency { unsigned int peak_lat; unsigned int avg_lat; unsigned int cnt; }; #endif /* __F2FS_IOSTAT_LATENCY_TYPE */ TRACE_EVENT(f2fs_iostat_latency, TP_PROTO(struct f2fs_sb_info *sbi, struct f2fs_iostat_latency (*iostat_lat)[NR_PAGE_TYPE]), TP_ARGS(sbi, iostat_lat), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, d_rd_peak) __field(unsigned int, d_rd_avg) __field(unsigned int, d_rd_cnt) __field(unsigned int, n_rd_peak) __field(unsigned int, n_rd_avg) __field(unsigned int, n_rd_cnt) __field(unsigned int, m_rd_peak) __field(unsigned int, m_rd_avg) __field(unsigned int, m_rd_cnt) __field(unsigned int, d_wr_s_peak) __field(unsigned int, d_wr_s_avg) __field(unsigned int, d_wr_s_cnt) __field(unsigned int, n_wr_s_peak) __field(unsigned int, n_wr_s_avg) __field(unsigned int, n_wr_s_cnt) __field(unsigned int, m_wr_s_peak) __field(unsigned int, m_wr_s_avg) __field(unsigned int, m_wr_s_cnt) __field(unsigned int, d_wr_as_peak) __field(unsigned int, d_wr_as_avg) __field(unsigned int, d_wr_as_cnt) __field(unsigned int, n_wr_as_peak) __field(unsigned int, n_wr_as_avg) __field(unsigned int, n_wr_as_cnt) __field(unsigned int, m_wr_as_peak) __field(unsigned int, m_wr_as_avg) __field(unsigned int, m_wr_as_cnt) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->d_rd_peak = iostat_lat[READ_IO][DATA].peak_lat; __entry->d_rd_avg = iostat_lat[READ_IO][DATA].avg_lat; __entry->d_rd_cnt = iostat_lat[READ_IO][DATA].cnt; __entry->n_rd_peak = iostat_lat[READ_IO][NODE].peak_lat; __entry->n_rd_avg = iostat_lat[READ_IO][NODE].avg_lat; __entry->n_rd_cnt = iostat_lat[READ_IO][NODE].cnt; __entry->m_rd_peak = iostat_lat[READ_IO][META].peak_lat; __entry->m_rd_avg = iostat_lat[READ_IO][META].avg_lat; __entry->m_rd_cnt = iostat_lat[READ_IO][META].cnt; __entry->d_wr_s_peak = iostat_lat[WRITE_SYNC_IO][DATA].peak_lat; __entry->d_wr_s_avg = iostat_lat[WRITE_SYNC_IO][DATA].avg_lat; __entry->d_wr_s_cnt = iostat_lat[WRITE_SYNC_IO][DATA].cnt; __entry->n_wr_s_peak = iostat_lat[WRITE_SYNC_IO][NODE].peak_lat; __entry->n_wr_s_avg = iostat_lat[WRITE_SYNC_IO][NODE].avg_lat; __entry->n_wr_s_cnt = iostat_lat[WRITE_SYNC_IO][NODE].cnt; __entry->m_wr_s_peak = iostat_lat[WRITE_SYNC_IO][META].peak_lat; __entry->m_wr_s_avg = iostat_lat[WRITE_SYNC_IO][META].avg_lat; __entry->m_wr_s_cnt = iostat_lat[WRITE_SYNC_IO][META].cnt; __entry->d_wr_as_peak = iostat_lat[WRITE_ASYNC_IO][DATA].peak_lat; __entry->d_wr_as_avg = iostat_lat[WRITE_ASYNC_IO][DATA].avg_lat; __entry->d_wr_as_cnt = iostat_lat[WRITE_ASYNC_IO][DATA].cnt; __entry->n_wr_as_peak = iostat_lat[WRITE_ASYNC_IO][NODE].peak_lat; __entry->n_wr_as_avg = iostat_lat[WRITE_ASYNC_IO][NODE].avg_lat; __entry->n_wr_as_cnt = iostat_lat[WRITE_ASYNC_IO][NODE].cnt; __entry->m_wr_as_peak = iostat_lat[WRITE_ASYNC_IO][META].peak_lat; __entry->m_wr_as_avg = iostat_lat[WRITE_ASYNC_IO][META].avg_lat; __entry->m_wr_as_cnt = iostat_lat[WRITE_ASYNC_IO][META].cnt; ), TP_printk("dev = (%d,%d), " "iotype [peak lat.(ms)/avg lat.(ms)/count], " "rd_data [%u/%u/%u], rd_node [%u/%u/%u], rd_meta [%u/%u/%u], " "wr_sync_data [%u/%u/%u], wr_sync_node [%u/%u/%u], " "wr_sync_meta [%u/%u/%u], wr_async_data [%u/%u/%u], " "wr_async_node [%u/%u/%u], wr_async_meta [%u/%u/%u]", show_dev(__entry->dev), __entry->d_rd_peak, __entry->d_rd_avg, __entry->d_rd_cnt, __entry->n_rd_peak, __entry->n_rd_avg, __entry->n_rd_cnt, __entry->m_rd_peak, __entry->m_rd_avg, __entry->m_rd_cnt, __entry->d_wr_s_peak, __entry->d_wr_s_avg, __entry->d_wr_s_cnt, __entry->n_wr_s_peak, __entry->n_wr_s_avg, __entry->n_wr_s_cnt, __entry->m_wr_s_peak, __entry->m_wr_s_avg, __entry->m_wr_s_cnt, __entry->d_wr_as_peak, __entry->d_wr_as_avg, __entry->d_wr_as_cnt, __entry->n_wr_as_peak, __entry->n_wr_as_avg, __entry->n_wr_as_cnt, __entry->m_wr_as_peak, __entry->m_wr_as_avg, __entry->m_wr_as_cnt) ); #endif TRACE_EVENT(f2fs_bmap, TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock), TP_ARGS(inode, lblock, pblock), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(sector_t, lblock) __field(sector_t, pblock) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblock = lblock; __entry->pblock = pblock; ), TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld", show_dev_ino(__entry), (unsigned long long)__entry->lblock, (unsigned long long)__entry->pblock) ); TRACE_EVENT(f2fs_fiemap, TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock, unsigned long long len, unsigned int flags, int ret), TP_ARGS(inode, lblock, pblock, len, flags, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(sector_t, lblock) __field(sector_t, pblock) __field(unsigned long long, len) __field(unsigned int, flags) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblock = lblock; __entry->pblock = pblock; __entry->len = len; __entry->flags = flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld, " "len:%llu, flags:%u, ret:%d", show_dev_ino(__entry), (unsigned long long)__entry->lblock, (unsigned long long)__entry->pblock, __entry->len, __entry->flags, __entry->ret) ); DECLARE_EVENT_CLASS(f2fs__rw_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, pid_t pid, char *pathname, char *command), TP_ARGS(inode, offset, bytes, pid, pathname, command), TP_STRUCT__entry( __string(pathbuf, pathname) __field(loff_t, offset) __field(int, bytes) __field(loff_t, i_size) __string(cmdline, command) __field(pid_t, pid) __field(ino_t, ino) ), TP_fast_assign( /* * Replace the spaces in filenames and cmdlines * because this screws up the tooling that parses * the traces. */ __assign_str(pathbuf); (void)strreplace(__get_str(pathbuf), ' ', '_'); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); __assign_str(cmdline); (void)strreplace(__get_str(cmdline), ' ', '_'); __entry->pid = pid; __entry->ino = inode->i_ino; ), TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," " pid %d, i_size %llu, ino %lu", __get_str(pathbuf), __entry->offset, __entry->bytes, __get_str(cmdline), __entry->pid, __entry->i_size, (unsigned long) __entry->ino) ); DECLARE_EVENT_CLASS(f2fs__rw_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), TP_ARGS(inode, offset, bytes), TP_STRUCT__entry( __field(ino_t, ino) __field(loff_t, offset) __field(int, bytes) ), TP_fast_assign( __entry->ino = inode->i_ino; __entry->offset = offset; __entry->bytes = bytes; ), TP_printk("ino %lu, offset %llu, bytes %d", (unsigned long) __entry->ino, __entry->offset, __entry->bytes) ); DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, pid_t pid, char *pathname, char *command), TP_ARGS(inode, offset, bytes, pid, pathname, command) ); DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), TP_ARGS(inode, offset, bytes) ); DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, pid_t pid, char *pathname, char *command), TP_ARGS(inode, offset, bytes, pid, pathname, command) ); DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), TP_ARGS(inode, offset, bytes) ); #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 1 1 1 1 3 3 3 1 1 3 3 3 3 3 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 | // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * * Copyright (C) 2011 Texas Instruments, Inc. * Copyright (C) 2014 Marvell International Ltd. * * Written by Ilan Elias <ilane@ti.com> * * Acknowledgements: * This file is based on hci_core.c, which was written * by Maxim Krasnyansky. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/module.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/completion.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/kcov.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> struct core_conn_create_data { int length; struct nci_core_conn_create_cmd *cmd; }; static void nci_cmd_work(struct work_struct *work); static void nci_rx_work(struct work_struct *work); static void nci_tx_work(struct work_struct *work); struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, int conn_id) { struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { if (conn_info->conn_id == conn_id) return conn_info; } return NULL; } int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, const struct dest_spec_params *params) { const struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { if (conn_info->dest_type == dest_type) { if (!params) return conn_info->conn_id; if (params->id == conn_info->dest_params->id && params->protocol == conn_info->dest_params->protocol) return conn_info->conn_id; } } return -EINVAL; } EXPORT_SYMBOL(nci_get_conn_info_by_dest_type_params); /* ---- NCI requests ---- */ void nci_req_complete(struct nci_dev *ndev, int result) { if (ndev->req_status == NCI_REQ_PEND) { ndev->req_result = result; ndev->req_status = NCI_REQ_DONE; complete(&ndev->req_completion); } } EXPORT_SYMBOL(nci_req_complete); static void nci_req_cancel(struct nci_dev *ndev, int err) { if (ndev->req_status == NCI_REQ_PEND) { ndev->req_result = err; ndev->req_status = NCI_REQ_CANCELED; complete(&ndev->req_completion); } } /* Execute request and wait for completion. */ static int __nci_request(struct nci_dev *ndev, void (*req)(struct nci_dev *ndev, const void *opt), const void *opt, __u32 timeout) { int rc = 0; long completion_rc; ndev->req_status = NCI_REQ_PEND; reinit_completion(&ndev->req_completion); req(ndev, opt); completion_rc = wait_for_completion_interruptible_timeout(&ndev->req_completion, timeout); pr_debug("wait_for_completion return %ld\n", completion_rc); if (completion_rc > 0) { switch (ndev->req_status) { case NCI_REQ_DONE: rc = nci_to_errno(ndev->req_result); break; case NCI_REQ_CANCELED: rc = -ndev->req_result; break; default: rc = -ETIMEDOUT; break; } } else { pr_err("wait_for_completion_interruptible_timeout failed %ld\n", completion_rc); rc = ((completion_rc == 0) ? (-ETIMEDOUT) : (completion_rc)); } ndev->req_status = ndev->req_result = 0; return rc; } inline int nci_request(struct nci_dev *ndev, void (*req)(struct nci_dev *ndev, const void *opt), const void *opt, __u32 timeout) { int rc; /* Serialize all requests */ mutex_lock(&ndev->req_lock); /* check the state after obtaing the lock against any races * from nci_close_device when the device gets removed. */ if (test_bit(NCI_UP, &ndev->flags)) rc = __nci_request(ndev, req, opt, timeout); else rc = -ENETDOWN; mutex_unlock(&ndev->req_lock); return rc; } static void nci_reset_req(struct nci_dev *ndev, const void *opt) { struct nci_core_reset_cmd cmd; cmd.reset_type = NCI_RESET_TYPE_RESET_CONFIG; nci_send_cmd(ndev, NCI_OP_CORE_RESET_CMD, 1, &cmd); } static void nci_init_req(struct nci_dev *ndev, const void *opt) { u8 plen = 0; if (opt) plen = sizeof(struct nci_core_init_v2_cmd); nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, opt); } static void nci_init_complete_req(struct nci_dev *ndev, const void *opt) { struct nci_rf_disc_map_cmd cmd; struct disc_map_config *cfg = cmd.mapping_configs; __u8 *num = &cmd.num_mapping_configs; int i; /* set rf mapping configurations */ *num = 0; /* by default mapping is set to NCI_RF_INTERFACE_FRAME */ for (i = 0; i < ndev->num_supported_rf_interfaces; i++) { if (ndev->supported_rf_interfaces[i] == NCI_RF_INTERFACE_ISO_DEP) { cfg[*num].rf_protocol = NCI_RF_PROTOCOL_ISO_DEP; cfg[*num].mode = NCI_DISC_MAP_MODE_POLL | NCI_DISC_MAP_MODE_LISTEN; cfg[*num].rf_interface = NCI_RF_INTERFACE_ISO_DEP; (*num)++; } else if (ndev->supported_rf_interfaces[i] == NCI_RF_INTERFACE_NFC_DEP) { cfg[*num].rf_protocol = NCI_RF_PROTOCOL_NFC_DEP; cfg[*num].mode = NCI_DISC_MAP_MODE_POLL | NCI_DISC_MAP_MODE_LISTEN; cfg[*num].rf_interface = NCI_RF_INTERFACE_NFC_DEP; (*num)++; } if (*num == NCI_MAX_NUM_MAPPING_CONFIGS) break; } nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_MAP_CMD, (1 + ((*num) * sizeof(struct disc_map_config))), &cmd); } struct nci_set_config_param { __u8 id; size_t len; const __u8 *val; }; static void nci_set_config_req(struct nci_dev *ndev, const void *opt) { const struct nci_set_config_param *param = opt; struct nci_core_set_config_cmd cmd; BUG_ON(param->len > NCI_MAX_PARAM_LEN); cmd.num_params = 1; cmd.param.id = param->id; cmd.param.len = param->len; memcpy(cmd.param.val, param->val, param->len); nci_send_cmd(ndev, NCI_OP_CORE_SET_CONFIG_CMD, (3 + param->len), &cmd); } struct nci_rf_discover_param { __u32 im_protocols; __u32 tm_protocols; }; static void nci_rf_discover_req(struct nci_dev *ndev, const void *opt) { const struct nci_rf_discover_param *param = opt; struct nci_rf_disc_cmd cmd; cmd.num_disc_configs = 0; if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && (param->im_protocols & NFC_PROTO_JEWEL_MASK || param->im_protocols & NFC_PROTO_MIFARE_MASK || param->im_protocols & NFC_PROTO_ISO14443_MASK || param->im_protocols & NFC_PROTO_NFC_DEP_MASK)) { cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = NCI_NFC_A_PASSIVE_POLL_MODE; cmd.disc_configs[cmd.num_disc_configs].frequency = 1; cmd.num_disc_configs++; } if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && (param->im_protocols & NFC_PROTO_ISO14443_B_MASK)) { cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = NCI_NFC_B_PASSIVE_POLL_MODE; cmd.disc_configs[cmd.num_disc_configs].frequency = 1; cmd.num_disc_configs++; } if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && (param->im_protocols & NFC_PROTO_FELICA_MASK || param->im_protocols & NFC_PROTO_NFC_DEP_MASK)) { cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = NCI_NFC_F_PASSIVE_POLL_MODE; cmd.disc_configs[cmd.num_disc_configs].frequency = 1; cmd.num_disc_configs++; } if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && (param->im_protocols & NFC_PROTO_ISO15693_MASK)) { cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = NCI_NFC_V_PASSIVE_POLL_MODE; cmd.disc_configs[cmd.num_disc_configs].frequency = 1; cmd.num_disc_configs++; } if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS - 1) && (param->tm_protocols & NFC_PROTO_NFC_DEP_MASK)) { cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = NCI_NFC_A_PASSIVE_LISTEN_MODE; cmd.disc_configs[cmd.num_disc_configs].frequency = 1; cmd.num_disc_configs++; cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = NCI_NFC_F_PASSIVE_LISTEN_MODE; cmd.disc_configs[cmd.num_disc_configs].frequency = 1; cmd.num_disc_configs++; } nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_CMD, (1 + (cmd.num_disc_configs * sizeof(struct disc_config))), &cmd); } struct nci_rf_discover_select_param { __u8 rf_discovery_id; __u8 rf_protocol; }; static void nci_rf_discover_select_req(struct nci_dev *ndev, const void *opt) { const struct nci_rf_discover_select_param *param = opt; struct nci_rf_discover_select_cmd cmd; cmd.rf_discovery_id = param->rf_discovery_id; cmd.rf_protocol = param->rf_protocol; switch (cmd.rf_protocol) { case NCI_RF_PROTOCOL_ISO_DEP: cmd.rf_interface = NCI_RF_INTERFACE_ISO_DEP; break; case NCI_RF_PROTOCOL_NFC_DEP: cmd.rf_interface = NCI_RF_INTERFACE_NFC_DEP; break; default: cmd.rf_interface = NCI_RF_INTERFACE_FRAME; break; } nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_SELECT_CMD, sizeof(struct nci_rf_discover_select_cmd), &cmd); } static void nci_rf_deactivate_req(struct nci_dev *ndev, const void *opt) { struct nci_rf_deactivate_cmd cmd; cmd.type = (unsigned long)opt; nci_send_cmd(ndev, NCI_OP_RF_DEACTIVATE_CMD, sizeof(struct nci_rf_deactivate_cmd), &cmd); } struct nci_cmd_param { __u16 opcode; size_t len; const __u8 *payload; }; static void nci_generic_req(struct nci_dev *ndev, const void *opt) { const struct nci_cmd_param *param = opt; nci_send_cmd(ndev, param->opcode, param->len, param->payload); } int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, const __u8 *payload) { struct nci_cmd_param param; param.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY, oid); param.len = len; param.payload = payload; return __nci_request(ndev, nci_generic_req, ¶m, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_prop_cmd); int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, const __u8 *payload) { struct nci_cmd_param param; param.opcode = opcode; param.len = len; param.payload = payload; return __nci_request(ndev, nci_generic_req, ¶m, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_cmd); int nci_core_reset(struct nci_dev *ndev) { return __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); } EXPORT_SYMBOL(nci_core_reset); int nci_core_init(struct nci_dev *ndev) { return __nci_request(ndev, nci_init_req, (void *)0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } EXPORT_SYMBOL(nci_core_init); struct nci_loopback_data { u8 conn_id; struct sk_buff *data; }; static void nci_send_data_req(struct nci_dev *ndev, const void *opt) { const struct nci_loopback_data *data = opt; nci_send_data(ndev, data->conn_id, data->data); } static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); if (!conn_info) { nci_req_complete(ndev, NCI_STATUS_REJECTED); return; } conn_info->rx_skb = skb; nci_req_complete(ndev, NCI_STATUS_OK); } int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len, struct sk_buff **resp) { int r; struct nci_loopback_data loopback_data; struct nci_conn_info *conn_info; struct sk_buff *skb; int conn_id = nci_get_conn_info_by_dest_type_params(ndev, NCI_DESTINATION_NFCC_LOOPBACK, NULL); if (conn_id < 0) { r = nci_core_conn_create(ndev, NCI_DESTINATION_NFCC_LOOPBACK, 0, 0, NULL); if (r != NCI_STATUS_OK) return r; conn_id = nci_get_conn_info_by_dest_type_params(ndev, NCI_DESTINATION_NFCC_LOOPBACK, NULL); } conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) return -EPROTO; /* store cb and context to be used on receiving data */ conn_info->data_exchange_cb = nci_nfcc_loopback_cb; conn_info->data_exchange_cb_context = ndev; skb = nci_skb_alloc(ndev, NCI_DATA_HDR_SIZE + data_len, GFP_KERNEL); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE); skb_put_data(skb, data, data_len); loopback_data.conn_id = conn_id; loopback_data.data = skb; ndev->cur_conn_id = conn_id; r = nci_request(ndev, nci_send_data_req, &loopback_data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK && resp) *resp = conn_info->rx_skb; return r; } EXPORT_SYMBOL(nci_nfcc_loopback); static int nci_open_device(struct nci_dev *ndev) { int rc = 0; mutex_lock(&ndev->req_lock); if (test_bit(NCI_UNREG, &ndev->flags)) { rc = -ENODEV; goto done; } if (test_bit(NCI_UP, &ndev->flags)) { rc = -EALREADY; goto done; } if (ndev->ops->open(ndev)) { rc = -EIO; goto done; } atomic_set(&ndev->cmd_cnt, 1); set_bit(NCI_INIT, &ndev->flags); if (ndev->ops->init) rc = ndev->ops->init(ndev); if (!rc) { rc = __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); } if (!rc && ndev->ops->setup) { rc = ndev->ops->setup(ndev); } if (!rc) { struct nci_core_init_v2_cmd nci_init_v2_cmd = { .feature1 = NCI_FEATURE_DISABLE, .feature2 = NCI_FEATURE_DISABLE }; const void *opt = NULL; if (ndev->nci_ver & NCI_VER_2_MASK) opt = &nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } if (!rc && ndev->ops->post_setup) rc = ndev->ops->post_setup(ndev); if (!rc) { rc = __nci_request(ndev, nci_init_complete_req, (void *)0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } clear_bit(NCI_INIT, &ndev->flags); if (!rc) { set_bit(NCI_UP, &ndev->flags); nci_clear_target_list(ndev); atomic_set(&ndev->state, NCI_IDLE); } else { /* Init failed, cleanup */ skb_queue_purge(&ndev->cmd_q); skb_queue_purge(&ndev->rx_q); skb_queue_purge(&ndev->tx_q); ndev->ops->close(ndev); ndev->flags &= BIT(NCI_UNREG); } done: mutex_unlock(&ndev->req_lock); return rc; } static int nci_close_device(struct nci_dev *ndev) { nci_req_cancel(ndev, ENODEV); /* This mutex needs to be held as a barrier for * caller nci_unregister_device */ mutex_lock(&ndev->req_lock); if (!test_and_clear_bit(NCI_UP, &ndev->flags)) { /* Need to flush the cmd wq in case * there is a queued/running cmd_work */ flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); timer_delete_sync(&ndev->data_timer); mutex_unlock(&ndev->req_lock); return 0; } /* Drop RX and TX queues */ skb_queue_purge(&ndev->rx_q); skb_queue_purge(&ndev->tx_q); /* Flush RX and TX wq */ flush_workqueue(ndev->rx_wq); flush_workqueue(ndev->tx_wq); /* Reset device */ skb_queue_purge(&ndev->cmd_q); atomic_set(&ndev->cmd_cnt, 1); set_bit(NCI_INIT, &ndev->flags); __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); /* After this point our queues are empty * and no works are scheduled. */ ndev->ops->close(ndev); clear_bit(NCI_INIT, &ndev->flags); /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); /* Clear flags except NCI_UNREG */ ndev->flags &= BIT(NCI_UNREG); mutex_unlock(&ndev->req_lock); return 0; } /* NCI command timer function */ static void nci_cmd_timer(struct timer_list *t) { struct nci_dev *ndev = timer_container_of(ndev, t, cmd_timer); atomic_set(&ndev->cmd_cnt, 1); queue_work(ndev->cmd_wq, &ndev->cmd_work); } /* NCI data exchange timer function */ static void nci_data_timer(struct timer_list *t) { struct nci_dev *ndev = timer_container_of(ndev, t, data_timer); set_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); queue_work(ndev->rx_wq, &ndev->rx_work); } static int nci_dev_up(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); return nci_open_device(ndev); } static int nci_dev_down(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); return nci_close_device(ndev); } int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val) { struct nci_set_config_param param; if (!val || !len) return 0; param.id = id; param.len = len; param.val = val; return __nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); } EXPORT_SYMBOL(nci_set_config); static void nci_nfcee_discover_req(struct nci_dev *ndev, const void *opt) { struct nci_nfcee_discover_cmd cmd; __u8 action = (unsigned long)opt; cmd.discovery_action = action; nci_send_cmd(ndev, NCI_OP_NFCEE_DISCOVER_CMD, 1, &cmd); } int nci_nfcee_discover(struct nci_dev *ndev, u8 action) { unsigned long opt = action; return __nci_request(ndev, nci_nfcee_discover_req, (void *)opt, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_discover); static void nci_nfcee_mode_set_req(struct nci_dev *ndev, const void *opt) { const struct nci_nfcee_mode_set_cmd *cmd = opt; nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD, sizeof(struct nci_nfcee_mode_set_cmd), cmd); } int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode) { struct nci_nfcee_mode_set_cmd cmd; cmd.nfcee_id = nfcee_id; cmd.nfcee_mode = nfcee_mode; return __nci_request(ndev, nci_nfcee_mode_set_req, &cmd, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_mode_set); static void nci_core_conn_create_req(struct nci_dev *ndev, const void *opt) { const struct core_conn_create_data *data = opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd); } int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, u8 number_destination_params, size_t params_len, const struct core_conn_create_dest_spec_params *params) { int r; struct nci_core_conn_create_cmd *cmd; struct core_conn_create_data data; data.length = params_len + sizeof(struct nci_core_conn_create_cmd); cmd = kzalloc(data.length, GFP_KERNEL); if (!cmd) return -ENOMEM; cmd->destination_type = destination_type; cmd->number_destination_params = number_destination_params; data.cmd = cmd; if (params) { memcpy(cmd->params, params, params_len); if (params->length > 0) memcpy(&ndev->cur_params, ¶ms->value[DEST_SPEC_PARAMS_ID_INDEX], sizeof(struct dest_spec_params)); else ndev->cur_params.id = 0; } else { ndev->cur_params.id = 0; } ndev->cur_dest_type = destination_type; r = __nci_request(ndev, nci_core_conn_create_req, &data, msecs_to_jiffies(NCI_CMD_TIMEOUT)); kfree(cmd); return r; } EXPORT_SYMBOL(nci_core_conn_create); static void nci_core_conn_close_req(struct nci_dev *ndev, const void *opt) { __u8 conn_id = (unsigned long)opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CLOSE_CMD, 1, &conn_id); } int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) { unsigned long opt = conn_id; ndev->cur_conn_id = conn_id; return __nci_request(ndev, nci_core_conn_close_req, (void *)opt, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_conn_close); static void nci_set_target_ats(struct nfc_target *target, struct nci_dev *ndev) { if (ndev->target_ats_len > 0) { target->ats_len = ndev->target_ats_len; memcpy(target->ats, ndev->target_ats, target->ats_len); } } static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_set_config_param param; int rc; param.val = nfc_get_local_general_bytes(nfc_dev, ¶m.len); if ((param.val == NULL) || (param.len == 0)) return 0; if (param.len > NFC_MAX_GT_LEN) return -EINVAL; param.id = NCI_PN_ATR_REQ_GEN_BYTES; rc = nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); if (rc) return rc; param.id = NCI_LN_ATR_RES_GEN_BYTES; return nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); } static int nci_set_listen_parameters(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; __u8 val; val = NCI_LA_SEL_INFO_NFC_DEP_MASK; rc = nci_set_config(ndev, NCI_LA_SEL_INFO, 1, &val); if (rc) return rc; val = NCI_LF_PROTOCOL_TYPE_NFC_DEP_MASK; rc = nci_set_config(ndev, NCI_LF_PROTOCOL_TYPE, 1, &val); if (rc) return rc; val = NCI_LF_CON_BITR_F_212 | NCI_LF_CON_BITR_F_424; return nci_set_config(ndev, NCI_LF_CON_BITR_F, 1, &val); } static int nci_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, __u32 tm_protocols) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_rf_discover_param param; int rc; if ((atomic_read(&ndev->state) == NCI_DISCOVERY) || (atomic_read(&ndev->state) == NCI_W4_ALL_DISCOVERIES)) { pr_err("unable to start poll, since poll is already active\n"); return -EBUSY; } if (ndev->target_active_prot) { pr_err("there is an active target\n"); return -EBUSY; } if ((atomic_read(&ndev->state) == NCI_W4_HOST_SELECT) || (atomic_read(&ndev->state) == NCI_POLL_ACTIVE)) { pr_debug("target active or w4 select, implicitly deactivate\n"); rc = nci_request(ndev, nci_rf_deactivate_req, (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); if (rc) return -EBUSY; } if ((im_protocols | tm_protocols) & NFC_PROTO_NFC_DEP_MASK) { rc = nci_set_local_general_bytes(nfc_dev); if (rc) { pr_err("failed to set local general bytes\n"); return rc; } } if (tm_protocols & NFC_PROTO_NFC_DEP_MASK) { rc = nci_set_listen_parameters(nfc_dev); if (rc) pr_err("failed to set listen parameters\n"); } param.im_protocols = im_protocols; param.tm_protocols = tm_protocols; rc = nci_request(ndev, nci_rf_discover_req, ¶m, msecs_to_jiffies(NCI_RF_DISC_TIMEOUT)); if (!rc) ndev->poll_prots = im_protocols; return rc; } static void nci_stop_poll(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); if ((atomic_read(&ndev->state) != NCI_DISCOVERY) && (atomic_read(&ndev->state) != NCI_W4_ALL_DISCOVERIES)) { pr_err("unable to stop poll, since poll is not active\n"); return; } nci_request(ndev, nci_rf_deactivate_req, (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } static int nci_activate_target(struct nfc_dev *nfc_dev, struct nfc_target *target, __u32 protocol) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_rf_discover_select_param param; const struct nfc_target *nci_target = NULL; int i; int rc = 0; pr_debug("target_idx %d, protocol 0x%x\n", target->idx, protocol); if ((atomic_read(&ndev->state) != NCI_W4_HOST_SELECT) && (atomic_read(&ndev->state) != NCI_POLL_ACTIVE)) { pr_err("there is no available target to activate\n"); return -EINVAL; } if (ndev->target_active_prot) { pr_err("there is already an active target\n"); return -EBUSY; } for (i = 0; i < ndev->n_targets; i++) { if (ndev->targets[i].idx == target->idx) { nci_target = &ndev->targets[i]; break; } } if (!nci_target) { pr_err("unable to find the selected target\n"); return -EINVAL; } if (protocol >= NFC_PROTO_MAX) { pr_err("the requested nfc protocol is invalid\n"); return -EINVAL; } if (!(nci_target->supported_protocols & (1 << protocol))) { pr_err("target does not support the requested protocol 0x%x\n", protocol); return -EINVAL; } if (atomic_read(&ndev->state) == NCI_W4_HOST_SELECT) { param.rf_discovery_id = nci_target->logical_idx; if (protocol == NFC_PROTO_JEWEL) param.rf_protocol = NCI_RF_PROTOCOL_T1T; else if (protocol == NFC_PROTO_MIFARE) param.rf_protocol = NCI_RF_PROTOCOL_T2T; else if (protocol == NFC_PROTO_FELICA) param.rf_protocol = NCI_RF_PROTOCOL_T3T; else if (protocol == NFC_PROTO_ISO14443 || protocol == NFC_PROTO_ISO14443_B) param.rf_protocol = NCI_RF_PROTOCOL_ISO_DEP; else param.rf_protocol = NCI_RF_PROTOCOL_NFC_DEP; rc = nci_request(ndev, nci_rf_discover_select_req, ¶m, msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT)); } if (!rc) { ndev->target_active_prot = protocol; if (protocol == NFC_PROTO_ISO14443) nci_set_target_ats(target, ndev); } return rc; } static void nci_deactivate_target(struct nfc_dev *nfc_dev, struct nfc_target *target, __u8 mode) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); unsigned long nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; if (!ndev->target_active_prot) { pr_err("unable to deactivate target, no active target\n"); return; } ndev->target_active_prot = 0; switch (mode) { case NFC_TARGET_MODE_SLEEP: nci_mode = NCI_DEACTIVATE_TYPE_SLEEP_MODE; break; } if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) { nci_request(ndev, nci_rf_deactivate_req, (void *)nci_mode, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } } static int nci_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target, __u8 comm_mode, __u8 *gb, size_t gb_len) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; pr_debug("target_idx %d, comm_mode %d\n", target->idx, comm_mode); rc = nci_activate_target(nfc_dev, target, NFC_PROTO_NFC_DEP); if (rc) return rc; rc = nfc_set_remote_general_bytes(nfc_dev, ndev->remote_gb, ndev->remote_gb_len); if (!rc) rc = nfc_dep_link_is_up(nfc_dev, target->idx, NFC_COMM_PASSIVE, NFC_RF_INITIATOR); return rc; } static int nci_dep_link_down(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; if (nfc_dev->rf_mode == NFC_RF_INITIATOR) { nci_deactivate_target(nfc_dev, NULL, NCI_DEACTIVATE_TYPE_IDLE_MODE); } else { if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE || atomic_read(&ndev->state) == NCI_DISCOVERY) { nci_request(ndev, nci_rf_deactivate_req, (void *)0, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } rc = nfc_tm_deactivated(nfc_dev); if (rc) pr_err("error when signaling tm deactivation\n"); } return 0; } static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, struct sk_buff *skb, data_exchange_cb_t cb, void *cb_context) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; if (!conn_info) return -EPROTO; pr_debug("target_idx %d, len %d\n", target->idx, skb->len); if (!ndev->target_active_prot) { pr_err("unable to exchange data, no active target\n"); return -EINVAL; } if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) return -EBUSY; /* store cb and context to be used on receiving data */ conn_info->data_exchange_cb = cb; conn_info->data_exchange_cb_context = cb_context; rc = nci_send_data(ndev, NCI_STATIC_RF_CONN_ID, skb); if (rc) clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); return rc; } static int nci_tm_send(struct nfc_dev *nfc_dev, struct sk_buff *skb) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; rc = nci_send_data(ndev, NCI_STATIC_RF_CONN_ID, skb); if (rc) pr_err("unable to send data\n"); return rc; } static int nci_enable_se(struct nfc_dev *nfc_dev, u32 se_idx) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); if (ndev->ops->enable_se) return ndev->ops->enable_se(ndev, se_idx); return 0; } static int nci_disable_se(struct nfc_dev *nfc_dev, u32 se_idx) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); if (ndev->ops->disable_se) return ndev->ops->disable_se(ndev, se_idx); return 0; } static int nci_discover_se(struct nfc_dev *nfc_dev) { int r; struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); if (ndev->ops->discover_se) { r = nci_nfcee_discover(ndev, NCI_NFCEE_DISCOVERY_ACTION_ENABLE); if (r != NCI_STATUS_OK) return -EPROTO; return ndev->ops->discover_se(ndev); } return 0; } static int nci_se_io(struct nfc_dev *nfc_dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); if (ndev->ops->se_io) return ndev->ops->se_io(ndev, se_idx, apdu, apdu_length, cb, cb_context); return 0; } static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); if (!ndev->ops->fw_download) return -ENOTSUPP; return ndev->ops->fw_download(ndev, firmware_name); } static const struct nfc_ops nci_nfc_ops = { .dev_up = nci_dev_up, .dev_down = nci_dev_down, .start_poll = nci_start_poll, .stop_poll = nci_stop_poll, .dep_link_up = nci_dep_link_up, .dep_link_down = nci_dep_link_down, .activate_target = nci_activate_target, .deactivate_target = nci_deactivate_target, .im_transceive = nci_transceive, .tm_send = nci_tm_send, .enable_se = nci_enable_se, .disable_se = nci_disable_se, .discover_se = nci_discover_se, .se_io = nci_se_io, .fw_download = nci_fw_download, }; /* ---- Interface to NCI drivers ---- */ /** * nci_allocate_device - allocate a new nci device * * @ops: device operations * @supported_protocols: NFC protocols supported by the device * @tx_headroom: Reserved space at beginning of skb * @tx_tailroom: Reserved space at end of skb */ struct nci_dev *nci_allocate_device(const struct nci_ops *ops, __u32 supported_protocols, int tx_headroom, int tx_tailroom) { struct nci_dev *ndev; pr_debug("supported_protocols 0x%x\n", supported_protocols); if (!ops->open || !ops->close || !ops->send) return NULL; if (!supported_protocols) return NULL; ndev = kzalloc(sizeof(struct nci_dev), GFP_KERNEL); if (!ndev) return NULL; ndev->ops = ops; if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) { pr_err("Too many proprietary commands: %zd\n", ops->n_prop_ops); goto free_nci; } ndev->tx_headroom = tx_headroom; ndev->tx_tailroom = tx_tailroom; init_completion(&ndev->req_completion); ndev->nfc_dev = nfc_allocate_device(&nci_nfc_ops, supported_protocols, tx_headroom + NCI_DATA_HDR_SIZE, tx_tailroom); if (!ndev->nfc_dev) goto free_nci; ndev->hci_dev = nci_hci_allocate(ndev); if (!ndev->hci_dev) goto free_nfc; nfc_set_drvdata(ndev->nfc_dev, ndev); return ndev; free_nfc: nfc_free_device(ndev->nfc_dev); free_nci: kfree(ndev); return NULL; } EXPORT_SYMBOL(nci_allocate_device); /** * nci_free_device - deallocate nci device * * @ndev: The nci device to deallocate */ void nci_free_device(struct nci_dev *ndev) { nfc_free_device(ndev->nfc_dev); nci_hci_deallocate(ndev); /* drop partial rx data packet if present */ if (ndev->rx_data_reassembly) kfree_skb(ndev->rx_data_reassembly); kfree(ndev); } EXPORT_SYMBOL(nci_free_device); /** * nci_register_device - register a nci device in the nfc subsystem * * @ndev: The nci device to register */ int nci_register_device(struct nci_dev *ndev) { int rc; struct device *dev = &ndev->nfc_dev->dev; char name[32]; ndev->flags = 0; INIT_WORK(&ndev->cmd_work, nci_cmd_work); snprintf(name, sizeof(name), "%s_nci_cmd_wq", dev_name(dev)); ndev->cmd_wq = create_singlethread_workqueue(name); if (!ndev->cmd_wq) { rc = -ENOMEM; goto exit; } INIT_WORK(&ndev->rx_work, nci_rx_work); snprintf(name, sizeof(name), "%s_nci_rx_wq", dev_name(dev)); ndev->rx_wq = create_singlethread_workqueue(name); if (!ndev->rx_wq) { rc = -ENOMEM; goto destroy_cmd_wq_exit; } INIT_WORK(&ndev->tx_work, nci_tx_work); snprintf(name, sizeof(name), "%s_nci_tx_wq", dev_name(dev)); ndev->tx_wq = create_singlethread_workqueue(name); if (!ndev->tx_wq) { rc = -ENOMEM; goto destroy_rx_wq_exit; } skb_queue_head_init(&ndev->cmd_q); skb_queue_head_init(&ndev->rx_q); skb_queue_head_init(&ndev->tx_q); timer_setup(&ndev->cmd_timer, nci_cmd_timer, 0); timer_setup(&ndev->data_timer, nci_data_timer, 0); mutex_init(&ndev->req_lock); INIT_LIST_HEAD(&ndev->conn_info_list); rc = nfc_register_device(ndev->nfc_dev); if (rc) goto destroy_tx_wq_exit; goto exit; destroy_tx_wq_exit: destroy_workqueue(ndev->tx_wq); destroy_rx_wq_exit: destroy_workqueue(ndev->rx_wq); destroy_cmd_wq_exit: destroy_workqueue(ndev->cmd_wq); exit: return rc; } EXPORT_SYMBOL(nci_register_device); /** * nci_unregister_device - unregister a nci device in the nfc subsystem * * @ndev: The nci device to unregister */ void nci_unregister_device(struct nci_dev *ndev) { struct nci_conn_info *conn_info, *n; /* This set_bit is not protected with specialized barrier, * However, it is fine because the mutex_lock(&ndev->req_lock); * in nci_close_device() will help to emit one. */ set_bit(NCI_UNREG, &ndev->flags); nci_close_device(ndev); destroy_workqueue(ndev->cmd_wq); destroy_workqueue(ndev->rx_wq); destroy_workqueue(ndev->tx_wq); list_for_each_entry_safe(conn_info, n, &ndev->conn_info_list, list) { list_del(&conn_info->list); /* conn_info is allocated with devm_kzalloc */ } nfc_unregister_device(ndev->nfc_dev); } EXPORT_SYMBOL(nci_unregister_device); /** * nci_recv_frame - receive frame from NCI drivers * * @ndev: The nci device * @skb: The sk_buff to receive */ int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb) { pr_debug("len %d\n", skb->len); if (!ndev || (!test_bit(NCI_UP, &ndev->flags) && !test_bit(NCI_INIT, &ndev->flags))) { kfree_skb(skb); return -ENXIO; } /* Queue frame for rx worker thread */ skb_queue_tail(&ndev->rx_q, skb); queue_work(ndev->rx_wq, &ndev->rx_work); return 0; } EXPORT_SYMBOL(nci_recv_frame); int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) { pr_debug("len %d\n", skb->len); if (!ndev) { kfree_skb(skb); return -ENODEV; } /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); /* Send copy to sniffer */ nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_TX); return ndev->ops->send(ndev, skb); } EXPORT_SYMBOL(nci_send_frame); /* Send NCI command */ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload) { struct nci_ctrl_hdr *hdr; struct sk_buff *skb; pr_debug("opcode 0x%x, plen %d\n", opcode, plen); skb = nci_skb_alloc(ndev, (NCI_CTRL_HDR_SIZE + plen), GFP_KERNEL); if (!skb) { pr_err("no memory for command\n"); return -ENOMEM; } hdr = skb_put(skb, NCI_CTRL_HDR_SIZE); hdr->gid = nci_opcode_gid(opcode); hdr->oid = nci_opcode_oid(opcode); hdr->plen = plen; nci_mt_set((__u8 *)hdr, NCI_MT_CMD_PKT); nci_pbf_set((__u8 *)hdr, NCI_PBF_LAST); if (plen) skb_put_data(skb, payload, plen); skb_queue_tail(&ndev->cmd_q, skb); queue_work(ndev->cmd_wq, &ndev->cmd_work); return 0; } EXPORT_SYMBOL(nci_send_cmd); /* Proprietary commands API */ static const struct nci_driver_ops *ops_cmd_lookup(const struct nci_driver_ops *ops, size_t n_ops, __u16 opcode) { size_t i; const struct nci_driver_ops *op; if (!ops || !n_ops) return NULL; for (i = 0; i < n_ops; i++) { op = &ops[i]; if (op->opcode == opcode) return op; } return NULL; } static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, rsp_opcode); if (!op || !op->rsp) return -ENOTSUPP; return op->rsp(ndev, skb); } static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, ntf_opcode); if (!op || !op->ntf) return -ENOTSUPP; return op->ntf(ndev, skb); } int nci_prop_rsp_packet(struct nci_dev *ndev, __u16 opcode, struct sk_buff *skb) { return nci_op_rsp_packet(ndev, opcode, skb, ndev->ops->prop_ops, ndev->ops->n_prop_ops); } int nci_prop_ntf_packet(struct nci_dev *ndev, __u16 opcode, struct sk_buff *skb) { return nci_op_ntf_packet(ndev, opcode, skb, ndev->ops->prop_ops, ndev->ops->n_prop_ops); } int nci_core_rsp_packet(struct nci_dev *ndev, __u16 opcode, struct sk_buff *skb) { return nci_op_rsp_packet(ndev, opcode, skb, ndev->ops->core_ops, ndev->ops->n_core_ops); } int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, struct sk_buff *skb) { return nci_op_ntf_packet(ndev, opcode, skb, ndev->ops->core_ops, ndev->ops->n_core_ops); } static bool nci_valid_size(struct sk_buff *skb) { BUILD_BUG_ON(NCI_CTRL_HDR_SIZE != NCI_DATA_HDR_SIZE); unsigned int hdr_size = NCI_CTRL_HDR_SIZE; if (skb->len < hdr_size || !nci_plen(skb->data) || skb->len < hdr_size + nci_plen(skb->data)) { return false; } return true; } /* ---- NCI TX Data worker thread ---- */ static void nci_tx_work(struct work_struct *work) { struct nci_dev *ndev = container_of(work, struct nci_dev, tx_work); struct nci_conn_info *conn_info; struct sk_buff *skb; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); if (!conn_info) return; pr_debug("credits_cnt %d\n", atomic_read(&conn_info->credits_cnt)); /* Send queued tx data */ while (atomic_read(&conn_info->credits_cnt)) { skb = skb_dequeue(&ndev->tx_q); if (!skb) return; kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Check if data flow control is used */ if (atomic_read(&conn_info->credits_cnt) != NCI_DATA_FLOW_CONTROL_NOT_USED) atomic_dec(&conn_info->credits_cnt); pr_debug("NCI TX: MT=data, PBF=%d, conn_id=%d, plen=%d\n", nci_pbf(skb->data), nci_conn_id(skb->data), nci_plen(skb->data)); nci_send_frame(ndev, skb); mod_timer(&ndev->data_timer, jiffies + msecs_to_jiffies(NCI_DATA_TIMEOUT)); kcov_remote_stop(); } } /* ----- NCI RX worker thread (data & control) ----- */ static void nci_rx_work(struct work_struct *work) { struct nci_dev *ndev = container_of(work, struct nci_dev, rx_work); struct sk_buff *skb; for (; (skb = skb_dequeue(&ndev->rx_q)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to sniffer */ nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); if (!nci_valid_size(skb)) { kfree_skb(skb); continue; } /* Process frame */ switch (nci_mt(skb->data)) { case NCI_MT_RSP_PKT: nci_rsp_packet(ndev, skb); break; case NCI_MT_NTF_PKT: nci_ntf_packet(ndev, skb); break; case NCI_MT_DATA_PKT: nci_rx_data_packet(ndev, skb); break; default: pr_err("unknown MT 0x%x\n", nci_mt(skb->data)); kfree_skb(skb); break; } } /* check if a data exchange timeout has occurred */ if (test_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags)) { /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) nci_data_exchange_complete(ndev, NULL, ndev->cur_conn_id, -ETIMEDOUT); clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); } } /* ----- NCI TX CMD worker thread ----- */ static void nci_cmd_work(struct work_struct *work) { struct nci_dev *ndev = container_of(work, struct nci_dev, cmd_work); struct sk_buff *skb; pr_debug("cmd_cnt %d\n", atomic_read(&ndev->cmd_cnt)); /* Send queued command */ if (atomic_read(&ndev->cmd_cnt)) { skb = skb_dequeue(&ndev->cmd_q); if (!skb) return; kcov_remote_start_common(skb_get_kcov_handle(skb)); atomic_dec(&ndev->cmd_cnt); pr_debug("NCI TX: MT=cmd, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n", nci_pbf(skb->data), nci_opcode_gid(nci_opcode(skb->data)), nci_opcode_oid(nci_opcode(skb->data)), nci_plen(skb->data)); nci_send_frame(ndev, skb); mod_timer(&ndev->cmd_timer, jiffies + msecs_to_jiffies(NCI_CMD_TIMEOUT)); kcov_remote_stop(); } } MODULE_DESCRIPTION("NFC Controller Interface"); MODULE_LICENSE("GPL"); |
| 9 9 9 1 1 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | /* * videobuf2-vmalloc.c - vmalloc memory allocator for videobuf2 * * Copyright (C) 2010 Samsung Electronics * * Author: Pawel Osciak <pawel@osciak.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation. */ #include <linux/io.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <media/videobuf2-v4l2.h> #include <media/videobuf2-vmalloc.h> #include <media/videobuf2-memops.h> struct vb2_vmalloc_buf { void *vaddr; struct frame_vector *vec; enum dma_data_direction dma_dir; unsigned long size; refcount_t refcount; struct vb2_vmarea_handler handler; struct dma_buf *dbuf; }; static void vb2_vmalloc_put(void *buf_priv); static void *vb2_vmalloc_alloc(struct vb2_buffer *vb, struct device *dev, unsigned long size) { struct vb2_vmalloc_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL | vb->vb2_queue->gfp_flags); if (!buf) return ERR_PTR(-ENOMEM); buf->size = size; buf->vaddr = vmalloc_user(buf->size); if (!buf->vaddr) { pr_debug("vmalloc of size %ld failed\n", buf->size); kfree(buf); return ERR_PTR(-ENOMEM); } buf->dma_dir = vb->vb2_queue->dma_dir; buf->handler.refcount = &buf->refcount; buf->handler.put = vb2_vmalloc_put; buf->handler.arg = buf; refcount_set(&buf->refcount, 1); return buf; } static void vb2_vmalloc_put(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; if (refcount_dec_and_test(&buf->refcount)) { vfree(buf->vaddr); kfree(buf); } } static void *vb2_vmalloc_get_userptr(struct vb2_buffer *vb, struct device *dev, unsigned long vaddr, unsigned long size) { struct vb2_vmalloc_buf *buf; struct frame_vector *vec; int n_pages, offset, i; int ret = -ENOMEM; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); buf->dma_dir = vb->vb2_queue->dma_dir; offset = vaddr & ~PAGE_MASK; buf->size = size; vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE || buf->dma_dir == DMA_BIDIRECTIONAL); if (IS_ERR(vec)) { ret = PTR_ERR(vec); goto fail_pfnvec_create; } buf->vec = vec; n_pages = frame_vector_count(vec); if (frame_vector_to_pages(vec) < 0) { unsigned long *nums = frame_vector_pfns(vec); /* * We cannot get page pointers for these pfns. Check memory is * physically contiguous and use direct mapping. */ for (i = 1; i < n_pages; i++) if (nums[i-1] + 1 != nums[i]) goto fail_map; buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) goto fail_map; buf->vaddr += offset; return buf; fail_map: vb2_destroy_framevec(vec); fail_pfnvec_create: kfree(buf); return ERR_PTR(ret); } static void vb2_vmalloc_put_userptr(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; unsigned long vaddr = (unsigned long)buf->vaddr & PAGE_MASK; unsigned int i; struct page **pages; unsigned int n_pages; if (!buf->vec->is_pfns) { n_pages = frame_vector_count(buf->vec); if (vaddr) vm_unmap_ram((void *)vaddr, n_pages); if (buf->dma_dir == DMA_FROM_DEVICE || buf->dma_dir == DMA_BIDIRECTIONAL) { pages = frame_vector_pages(buf->vec); if (!WARN_ON_ONCE(IS_ERR(pages))) for (i = 0; i < n_pages; i++) set_page_dirty_lock(pages[i]); } } else { iounmap((__force void __iomem *)buf->vaddr); } vb2_destroy_framevec(buf->vec); kfree(buf); } static void *vb2_vmalloc_vaddr(struct vb2_buffer *vb, void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; if (!buf->vaddr) { pr_err("Address of an unallocated plane requested or cannot map user pointer\n"); return NULL; } return buf->vaddr; } static unsigned int vb2_vmalloc_num_users(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; return refcount_read(&buf->refcount); } static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) { struct vb2_vmalloc_buf *buf = buf_priv; int ret; if (!buf) { pr_err("No memory to map\n"); return -EINVAL; } ret = remap_vmalloc_range(vma, buf->vaddr, 0); if (ret) { pr_err("Remapping vmalloc memory, error: %d\n", ret); return ret; } /* * Make sure that vm_areas for 2 buffers won't be merged together */ vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. */ vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; vma->vm_ops->open(vma); return 0; } #ifdef CONFIG_HAS_DMA /*********************************************/ /* DMABUF ops for exporters */ /*********************************************/ struct vb2_vmalloc_attachment { struct sg_table sgt; enum dma_data_direction dma_dir; }; static int vb2_vmalloc_dmabuf_ops_attach(struct dma_buf *dbuf, struct dma_buf_attachment *dbuf_attach) { struct vb2_vmalloc_attachment *attach; struct vb2_vmalloc_buf *buf = dbuf->priv; int num_pages = PAGE_ALIGN(buf->size) / PAGE_SIZE; struct sg_table *sgt; struct scatterlist *sg; void *vaddr = buf->vaddr; int ret; int i; attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) return -ENOMEM; sgt = &attach->sgt; ret = sg_alloc_table(sgt, num_pages, GFP_KERNEL); if (ret) { kfree(attach); return ret; } for_each_sgtable_sg(sgt, sg, i) { struct page *page = vmalloc_to_page(vaddr); if (!page) { sg_free_table(sgt); kfree(attach); return -ENOMEM; } sg_set_page(sg, page, PAGE_SIZE, 0); vaddr += PAGE_SIZE; } attach->dma_dir = DMA_NONE; dbuf_attach->priv = attach; return 0; } static void vb2_vmalloc_dmabuf_ops_detach(struct dma_buf *dbuf, struct dma_buf_attachment *db_attach) { struct vb2_vmalloc_attachment *attach = db_attach->priv; struct sg_table *sgt; if (!attach) return; sgt = &attach->sgt; /* release the scatterlist cache */ if (attach->dma_dir != DMA_NONE) dma_unmap_sgtable(db_attach->dev, sgt, attach->dma_dir, 0); sg_free_table(sgt); kfree(attach); db_attach->priv = NULL; } static struct sg_table *vb2_vmalloc_dmabuf_ops_map( struct dma_buf_attachment *db_attach, enum dma_data_direction dma_dir) { struct vb2_vmalloc_attachment *attach = db_attach->priv; struct sg_table *sgt; sgt = &attach->sgt; /* return previously mapped sg table */ if (attach->dma_dir == dma_dir) return sgt; /* release any previous cache */ if (attach->dma_dir != DMA_NONE) { dma_unmap_sgtable(db_attach->dev, sgt, attach->dma_dir, 0); attach->dma_dir = DMA_NONE; } /* mapping to the client with new direction */ if (dma_map_sgtable(db_attach->dev, sgt, dma_dir, 0)) { pr_err("failed to map scatterlist\n"); return ERR_PTR(-EIO); } attach->dma_dir = dma_dir; return sgt; } static void vb2_vmalloc_dmabuf_ops_unmap(struct dma_buf_attachment *db_attach, struct sg_table *sgt, enum dma_data_direction dma_dir) { /* nothing to be done here */ } static void vb2_vmalloc_dmabuf_ops_release(struct dma_buf *dbuf) { /* drop reference obtained in vb2_vmalloc_get_dmabuf */ vb2_vmalloc_put(dbuf->priv); } static int vb2_vmalloc_dmabuf_ops_vmap(struct dma_buf *dbuf, struct iosys_map *map) { struct vb2_vmalloc_buf *buf = dbuf->priv; iosys_map_set_vaddr(map, buf->vaddr); return 0; } static int vb2_vmalloc_dmabuf_ops_mmap(struct dma_buf *dbuf, struct vm_area_struct *vma) { return vb2_vmalloc_mmap(dbuf->priv, vma); } static const struct dma_buf_ops vb2_vmalloc_dmabuf_ops = { .attach = vb2_vmalloc_dmabuf_ops_attach, .detach = vb2_vmalloc_dmabuf_ops_detach, .map_dma_buf = vb2_vmalloc_dmabuf_ops_map, .unmap_dma_buf = vb2_vmalloc_dmabuf_ops_unmap, .vmap = vb2_vmalloc_dmabuf_ops_vmap, .mmap = vb2_vmalloc_dmabuf_ops_mmap, .release = vb2_vmalloc_dmabuf_ops_release, }; static struct dma_buf *vb2_vmalloc_get_dmabuf(struct vb2_buffer *vb, void *buf_priv, unsigned long flags) { struct vb2_vmalloc_buf *buf = buf_priv; struct dma_buf *dbuf; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); exp_info.ops = &vb2_vmalloc_dmabuf_ops; exp_info.size = buf->size; exp_info.flags = flags; exp_info.priv = buf; if (WARN_ON(!buf->vaddr)) return NULL; dbuf = dma_buf_export(&exp_info); if (IS_ERR(dbuf)) return NULL; /* dmabuf keeps reference to vb2 buffer */ refcount_inc(&buf->refcount); return dbuf; } #endif /* CONFIG_HAS_DMA */ /*********************************************/ /* callbacks for DMABUF buffers */ /*********************************************/ static int vb2_vmalloc_map_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map; int ret; ret = dma_buf_vmap_unlocked(buf->dbuf, &map); if (ret) return -EFAULT; buf->vaddr = map.vaddr; return 0; } static void vb2_vmalloc_unmap_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map = IOSYS_MAP_INIT_VADDR(buf->vaddr); dma_buf_vunmap_unlocked(buf->dbuf, &map); buf->vaddr = NULL; } static void vb2_vmalloc_detach_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map = IOSYS_MAP_INIT_VADDR(buf->vaddr); if (buf->vaddr) dma_buf_vunmap_unlocked(buf->dbuf, &map); kfree(buf); } static void *vb2_vmalloc_attach_dmabuf(struct vb2_buffer *vb, struct device *dev, struct dma_buf *dbuf, unsigned long size) { struct vb2_vmalloc_buf *buf; if (dbuf->size < size) return ERR_PTR(-EFAULT); buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); buf->dbuf = dbuf; buf->dma_dir = vb->vb2_queue->dma_dir; buf->size = size; return buf; } const struct vb2_mem_ops vb2_vmalloc_memops = { .alloc = vb2_vmalloc_alloc, .put = vb2_vmalloc_put, .get_userptr = vb2_vmalloc_get_userptr, .put_userptr = vb2_vmalloc_put_userptr, #ifdef CONFIG_HAS_DMA .get_dmabuf = vb2_vmalloc_get_dmabuf, #endif .map_dmabuf = vb2_vmalloc_map_dmabuf, .unmap_dmabuf = vb2_vmalloc_unmap_dmabuf, .attach_dmabuf = vb2_vmalloc_attach_dmabuf, .detach_dmabuf = vb2_vmalloc_detach_dmabuf, .vaddr = vb2_vmalloc_vaddr, .mmap = vb2_vmalloc_mmap, .num_users = vb2_vmalloc_num_users, }; EXPORT_SYMBOL_GPL(vb2_vmalloc_memops); MODULE_DESCRIPTION("vmalloc memory handling routines for videobuf2"); MODULE_AUTHOR("Pawel Osciak <pawel@osciak.com>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("DMA_BUF"); |
| 220 194 120 119 103 68 191 155 99 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_QM_H__ #define __XFS_QM_H__ #include "xfs_dquot_item.h" #include "xfs_dquot.h" struct xfs_inode; extern struct kmem_cache *xfs_dqtrx_cache; /* * Number of bmaps that we ask from bmapi when doing a quotacheck. * We make this restriction to keep the memory usage to a minimum. */ #define XFS_DQITER_MAP_SIZE 10 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_blk.hardlimit && \ !dqp->q_blk.softlimit && \ !dqp->q_rtb.hardlimit && \ !dqp->q_rtb.softlimit && \ !dqp->q_ino.hardlimit && \ !dqp->q_ino.softlimit && \ !dqp->q_blk.count && \ !dqp->q_rtb.count && \ !dqp->q_ino.count) struct xfs_quota_limits { xfs_qcnt_t hard; /* default hard limit */ xfs_qcnt_t soft; /* default soft limit */ time64_t time; /* limit for timers */ }; /* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { struct xfs_quota_limits blk; struct xfs_quota_limits ino; struct xfs_quota_limits rtb; }; /* * Various quota information for individual filesystems. * The mount structure keeps a pointer to this. */ struct xfs_quotainfo { struct radix_tree_root qi_uquota_tree; struct radix_tree_root qi_gquota_tree; struct radix_tree_root qi_pquota_tree; struct mutex qi_tree_lock; struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; int qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dq in above chunk */ struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; struct shrinker *qi_shrinker; /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; time64_t qi_expiry_max; /* Hook to feed quota counter updates to an active online repair. */ struct xfs_hooks qi_mod_ino_dqtrx_hooks; struct xfs_hooks qi_apply_dqtrx_hooks; }; static inline struct radix_tree_root * xfs_dquot_tree( struct xfs_quotainfo *qi, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: return &qi->qi_uquota_tree; case XFS_DQTYPE_GROUP: return &qi->qi_gquota_tree; case XFS_DQTYPE_PROJ: return &qi->qi_pquota_tree; default: ASSERT(0); } return NULL; } static inline struct xfs_inode * xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: return mp->m_quotainfo->qi_uquotaip; case XFS_DQTYPE_GROUP: return mp->m_quotainfo->qi_gquotaip; case XFS_DQTYPE_PROJ: return mp->m_quotainfo->qi_pquotaip; default: ASSERT(0); } return NULL; } /* * Parameters for tracking dqtrx changes on behalf of an inode. The hook * function arg parameter is the field being updated. */ struct xfs_mod_ino_dqtrx_params { uintptr_t tx_id; xfs_ino_t ino; xfs_dqtype_t q_type; xfs_dqid_t q_id; int64_t delta; }; extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); /* * We keep the usr, grp, and prj dquots separately so that locking will be * easier to do at commit time. All transactions that we know of at this point * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. */ enum { XFS_QM_TRANS_USR = 0, XFS_QM_TRANS_GRP, XFS_QM_TRANS_PRJ, XFS_QM_TRANS_DQTYPES }; #define XFS_QM_TRANS_MAXDQS 5 struct xfs_dquot_acct { struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; }; /* * Users are allowed to have a usage exceeding their softlimit for * a period this long. */ #define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct qc_dqblk *dst); extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp, xfs_dqid_t *id, xfs_dqtype_t type, struct qc_dqblk *dst); extern int xfs_qm_scall_setqlim(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct qc_dqblk *newlim); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); static inline struct xfs_def_quota * xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: return &qi->qi_usr_default; case XFS_DQTYPE_GROUP: return &qi->qi_grp_default; case XFS_DQTYPE_PROJ: return &qi->qi_prj_default; default: ASSERT(0); return NULL; } } int xfs_qm_qino_load(struct xfs_mount *mp, xfs_dqtype_t type, struct xfs_inode **ipp); #endif /* __XFS_QM_H__ */ |
| 115 115 111 115 92 106 2 2 42 43 42 43 42 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 | /* CPU control. * (C) 2001, 2002, 2003, 2004 Rusty Russell * * This code is licenced under the GPL. */ #include <linux/sched/mm.h> #include <linux/proc_fs.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/sched/signal.h> #include <linux/sched/hotplug.h> #include <linux/sched/isolation.h> #include <linux/sched/task.h> #include <linux/sched/smt.h> #include <linux/unistd.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/bug.h> #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/suspend.h> #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> #include <linux/nmi.h> #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> #include <linux/scs.h> #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> #include <linux/random.h> #include <linux/cc_platform.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS #include <trace/events/cpuhp.h> #include "smpboot.h" /** * struct cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state * @fail: Current CPU hotplug callback state * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback * @single: Single callback invocation * @bringup: Single callback bringup or teardown selector * @node: Remote CPU node; for multi-instance, do a * single entry callback for install/remove * @last: For multi-instance rollback, remember how far we got * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @ap_sync_state: State for AP synchronization * @done_up: Signal completion to the issuer of the task for cpu-up * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; bool rollback; bool single; bool bringup; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; int result; atomic_t ap_sync_state; struct completion done_up; struct completion done_down; #endif }; static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; #ifdef CONFIG_SMP cpumask_t cpus_booted_once_mask; #endif #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); static struct lockdep_map cpuhp_state_down_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else static inline void cpuhp_lock_acquire(bool bringup) { } static inline void cpuhp_lock_release(bool bringup) { } #endif /** * struct cpuhp_step - Hotplug state machine step * @name: Name of the step * @startup: Startup function of the step * @teardown: Teardown function of the step * @cant_stop: Bringup/teardown can't be stopped at this step * @multi_instance: State has multiple instances which get added afterwards */ struct cpuhp_step { const char *name; union { int (*single)(unsigned int cpu); int (*multi)(unsigned int cpu, struct hlist_node *node); } startup; union { int (*single)(unsigned int cpu); int (*multi)(unsigned int cpu, struct hlist_node *node); } teardown; /* private: */ struct hlist_head list; /* public: */ bool cant_stop; bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_hp_states[]; static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { return cpuhp_hp_states + state; } static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) { return bringup ? !step->startup.single : !step->teardown.single; } /** * cpuhp_invoke_callback - Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked * @node: For multi-instance, do a single entry callback for install/remove * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. * * Return: %0 on success or a negative errno code */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node, struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); int (*cbm)(unsigned int cpu, struct hlist_node *node); int (*cb)(unsigned int cpu); int ret, cnt; if (st->fail == state) { st->fail = CPUHP_INVALID; return -EAGAIN; } if (cpuhp_step_empty(bringup, step)) { WARN_ON_ONCE(1); return 0; } if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } cbm = bringup ? step->startup.multi : step->teardown.multi; /* Single invocation for instance add/remove */ if (node) { WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { if (lastp && node == *lastp) break; trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); if (ret) { if (!lastp) goto err; *lastp = node; return ret; } cnt++; } if (lastp) *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ cbm = !bringup ? step->startup.multi : step->teardown.multi; if (!cbm) return ret; hlist_for_each(node, &step->list) { if (!cnt--) break; trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); /* * Rollback must not fail, */ WARN_ON_ONCE(ret); } return ret; } #ifdef CONFIG_SMP static bool cpuhp_is_ap_state(enum cpuhp_state state) { /* * The extra check for CPUHP_TEARDOWN_CPU is only for documentation * purposes as that state is handled explicitly in cpu_down. */ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; wait_for_completion(done); } static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; complete(done); } /* * The former STARTING/DYING states, ran with IRQs disabled and must not fail. */ static bool cpuhp_is_atomic_state(enum cpuhp_state state) { return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; } /* Synchronization state management */ enum cpuhp_sync_state { SYNC_STATE_DEAD, SYNC_STATE_KICKED, SYNC_STATE_SHOULD_DIE, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE, SYNC_STATE_ONLINE, }; #ifdef CONFIG_HOTPLUG_CORE_SYNC /** * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown * @state: The synchronization state to set * * No synchronization point. Just update of the synchronization state, but implies * a full barrier so that the AP changes are visible before the control CPU proceeds. */ static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); (void)atomic_xchg(st, state); } void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); } static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state, enum cpuhp_sync_state next_state) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); ktime_t now, end, start = ktime_get(); int sync; end = start + 10ULL * NSEC_PER_SEC; sync = atomic_read(st); while (1) { if (sync == state) { if (!atomic_try_cmpxchg(st, &sync, next_state)) continue; return true; } now = ktime_get(); if (now > end) { /* Timeout. Leave the state unchanged */ return false; } else if (now - start < NSEC_PER_MSEC) { /* Poll for one millisecond */ arch_cpuhp_sync_state_poll(); } else { usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC); } sync = atomic_read(st); } return true; } #else /* CONFIG_HOTPLUG_CORE_SYNC */ static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { } #endif /* !CONFIG_HOTPLUG_CORE_SYNC */ #ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD /** * cpuhp_ap_report_dead - Update synchronization state to DEAD * * No synchronization point. Just update of the synchronization state. */ void cpuhp_ap_report_dead(void) { cpuhp_ap_update_sync_state(SYNC_STATE_DEAD); } void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { } /* * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down * because the AP cannot issue complete() at this stage. */ static void cpuhp_bp_sync_dead(unsigned int cpu) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); int sync = atomic_read(st); do { /* CPU can have reported dead already. Don't overwrite that! */ if (sync == SYNC_STATE_DEAD) break; } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE)); if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) { /* CPU reached dead state. Invoke the cleanup function */ arch_cpuhp_cleanup_dead_cpu(cpu); return; } /* No further action possible. Emit message and give up. */ pr_err("CPU%u failed to report dead state\n", cpu); } #else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */ static inline void cpuhp_bp_sync_dead(unsigned int cpu) { } #endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */ #ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL /** * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive * * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits * for the BP to release it. */ void cpuhp_ap_sync_alive(void) { atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE); /* Wait for the control CPU to release it. */ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE) cpu_relax(); } static bool cpuhp_can_boot_ap(unsigned int cpu) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); int sync = atomic_read(st); again: switch (sync) { case SYNC_STATE_DEAD: /* CPU is properly dead */ break; case SYNC_STATE_KICKED: /* CPU did not come up in previous attempt */ break; case SYNC_STATE_ALIVE: /* CPU is stuck cpuhp_ap_sync_alive(). */ break; default: /* CPU failed to report online or dead and is in limbo state. */ return false; } /* Prepare for booting */ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED)) goto again; return true; } void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { } /* * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up * because the AP cannot issue complete() so early in the bringup. */ static int cpuhp_bp_sync_alive(unsigned int cpu) { int ret = 0; if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL)) return 0; if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) { pr_err("CPU%u failed to report alive state\n", cpu); ret = -EIO; } /* Let the architecture cleanup the kick alive mechanics. */ arch_cpuhp_cleanup_kick_cpu(cpu); return ret; } #else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */ static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; } static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; } #endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */ /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } /* * If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); static bool cpu_hotplug_offline_disabled __ro_after_init; void cpus_read_lock(void) { percpu_down_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_lock); int cpus_read_trylock(void) { return percpu_down_read_trylock(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_trylock); void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_unlock); void cpus_write_lock(void) { percpu_down_write(&cpu_hotplug_lock); } void cpus_write_unlock(void) { percpu_up_write(&cpu_hotplug_lock); } void lockdep_assert_cpus_held(void) { /* * We can't have hotplug operations before userspace starts running, * and some init codepaths will knowingly not take the hotplug lock. * This is all valid, so mute lockdep until it makes sense to report * unheld locks. */ if (system_state < SYSTEM_RUNNING) return; percpu_rwsem_assert_held(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) { return percpu_rwsem_is_held(&cpu_hotplug_lock); } #endif static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* Declare CPU offlining not supported */ void cpu_hotplug_disable_offlining(void) { cpu_maps_update_begin(); cpu_hotplug_offline_disabled = true; cpu_maps_update_done(); } /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the * hotplug path before performing hotplug operations. So acquiring that lock * guarantees mutual exclusion from any currently running hotplug operations. */ void cpu_hotplug_disable(void) { cpu_maps_update_begin(); cpu_hotplug_disabled++; cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); static void __cpu_hotplug_enable(void) { if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) return; cpu_hotplug_disabled--; } void cpu_hotplug_enable(void) { cpu_maps_update_begin(); __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #else static void lockdep_acquire_cpus_lock(void) { } static void lockdep_release_cpus_lock(void) { } #endif /* CONFIG_HOTPLUG_CPU */ /* * Architectures that need SMT-specific errata handling during SMT hotplug * should override this. */ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; static unsigned int cpu_smt_max_threads __ro_after_init; unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { if (!cpu_smt_possible()) return; if (force) { pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. Called from architecture code. */ void __init cpu_smt_set_num_threads(unsigned int num_threads, unsigned int max_threads) { WARN_ON(!num_threads || (num_threads > max_threads)); if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; cpu_smt_max_threads = max_threads; /* * If SMT has been disabled via the kernel command line or SMT is * not supported, set cpu_smt_num_threads to 1 for consistency. * If enabled, take the architecture requested number of threads * to bring up into account. */ if (cpu_smt_control != CPU_SMT_ENABLED) cpu_smt_num_threads = 1; else if (num_threads < cpu_smt_num_threads) cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); return 0; } early_param("nosmt", smt_cmdline_disable); /* * For Archicture supporting partial SMT states check if the thread is allowed. * Otherwise this has already been checked through cpu_smt_max_threads when * setting the SMT level. */ static inline bool cpu_smt_thread_allowed(unsigned int cpu) { #ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC return topology_smt_thread_allowed(cpu); #else return true; #endif } static inline bool cpu_bootable(unsigned int cpu) { if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; /* All CPUs are bootable if controls are not configured */ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) return true; /* All CPUs are bootable if CPU is not SMT capable */ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return true; if (topology_is_primary_thread(cpu)) return true; /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } /* Returns true if SMT is supported and not forcefully (irreversibly) disabled */ bool cpu_smt_possible(void) { return cpu_smt_control != CPU_SMT_FORCE_DISABLED && cpu_smt_control != CPU_SMT_NOT_SUPPORTED; } EXPORT_SYMBOL_GPL(cpu_smt_possible); #else static inline bool cpu_bootable(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; bool bringup = st->state < target; st->rollback = false; st->last = NULL; st->target = target; st->single = false; st->bringup = bringup; if (cpu_dying(cpu) != !bringup) set_cpu_dying(cpu, !bringup); return prev_state; } static inline void cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) { bool bringup = !st->bringup; st->target = prev_state; /* * Already rolling back. No need invert the bringup value or to change * the current state. */ if (st->rollback) return; st->rollback = true; /* * If we have st->last we need to undo partial multi_instance of this * state first. Otherwise start undo at the previous state. */ if (!st->last) { if (st->bringup) st->state--; else st->state++; } st->bringup = bringup; if (cpu_dying(cpu) != !bringup) set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) { if (!st->single && st->state == st->target) return; st->result = 0; /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() */ smp_mb(); st->should_run = true; wake_up_process(st->thread); wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state; int ret; prev_state = cpuhp_set_state(cpu, st, target); __cpuhp_kick_ap(st); if ((ret = st->result)) { cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } return ret; } static int bringup_wait_for_ap_online(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The * CPU marked itself as booted_once in notify_cpu_starting() so the * cpu_bootable() check will now return false if this is not the * primary sibling. */ if (!cpu_bootable(cpu)) return -ECANCELED; return 0; } #ifdef CONFIG_HOTPLUG_SPLIT_STARTUP static int cpuhp_kick_ap_alive(unsigned int cpu) { if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu)); } static int cpuhp_bringup_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int ret; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * Prevent irq alloc/free across the bringup. */ irq_lock_sparse(); ret = cpuhp_bp_sync_alive(cpu); if (ret) goto out_unlock; ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); out_unlock: irq_unlock_sparse(); return ret; } #else static int bringup_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle = idle_thread_get(cpu); int ret; if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * * Prevent irq alloc/free across the bringup by acquiring the * sparse irq lock. Hold it until the upcoming CPU completes the * startup in cpuhp_online_idle() which allows to avoid * intermediate synchronization points in the architecture code. */ irq_lock_sparse(); ret = __cpu_up(cpu, idle); if (ret) goto out_unlock; ret = cpuhp_bp_sync_alive(cpu); if (ret) goto out_unlock; ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); out_unlock: irq_unlock_sparse(); return ret; } #endif static int finish_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); struct mm_struct *mm = idle->active_mm; /* * sched_force_init_mm() ensured the use of &init_mm, * drop that refcount now that the CPU has stopped. */ WARN_ON(mm != &init_mm); idle->active_mm = NULL; mmdrop_lazy_tlb(mm); return 0; } /* * Hotplug state machine related functions */ /* * Get the next state to run. Empty ones will be skipped. Returns true if a * state must be run. * * st->state will be modified ahead of time, to match state_to_run, as if it * has already ran. */ static bool cpuhp_next_state(bool bringup, enum cpuhp_state *state_to_run, struct cpuhp_cpu_state *st, enum cpuhp_state target) { do { if (bringup) { if (st->state >= target) return false; *state_to_run = ++st->state; } else { if (st->state <= target) return false; *state_to_run = st->state--; } if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run))) break; } while (true); return true; } static int __cpuhp_invoke_callback_range(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target, bool nofail) { enum cpuhp_state state; int ret = 0; while (cpuhp_next_state(bringup, &state, st, target)) { int err; err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL); if (!err) continue; if (nofail) { pr_warn("CPU %u %s state %s (%d) failed (%d)\n", cpu, bringup ? "UP" : "DOWN", cpuhp_get_step(st->state)->name, st->state, err); ret = -1; } else { ret = err; break; } } return ret; } static inline int cpuhp_invoke_callback_range(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false); } static inline void cpuhp_invoke_callback_range_nofail(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { __cpuhp_invoke_callback_range(bringup, cpu, st, target, true); } static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) { if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) return true; /* * When CPU hotplug is disabled, then taking the CPU down is not * possible because takedown_cpu() and the architecture and * subsystem specific mechanisms are not available. So the CPU * which would be completely unplugged again needs to stay around * in the current state. */ return st->state <= CPUHP_BRINGUP_CPU; } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; ret = cpuhp_invoke_callback_range(true, cpu, st, target); if (ret) { pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n", ret, cpu, cpuhp_get_step(st->state)->name, st->state); cpuhp_reset_state(cpu, st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, prev_state)); } return ret; } /* * The cpu hotplug threads manage the bringup and teardown of the cpus */ static int cpuhp_should_run(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); return st->should_run; } /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. * * Each invocation of this function by the smpboot thread does a single AP * state callback. * * It has 3 modes of operation: * - single: runs st->cb_state * - up: runs ++st->state, while st->state < st->target * - down: runs st->state--, while st->state > st->target * * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); bool bringup = st->bringup; enum cpuhp_state state; if (WARN_ON_ONCE(!st->should_run)) return; /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); /* * The BP holds the hotplug lock, but we're now running on the AP, * ensure that anybody asserting the lock is held, will actually find * it so. */ lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; st->should_run = false; } else { st->should_run = cpuhp_next_state(bringup, &state, st, st->target); if (!st->should_run) goto end; } WARN_ON_ONCE(!cpuhp_is_ap_state(state)); if (cpuhp_is_atomic_state(state)) { local_irq_disable(); st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); local_irq_enable(); /* * STARTING/DYING must not fail! */ WARN_ON_ONCE(st->result); } else { st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); } if (st->result) { /* * If we fail on a rollback, we're up a creek without no * paddle, no way forward, no way back. We loose, thanks for * playing. */ WARN_ON_ONCE(st->rollback); st->should_run = false; } end: cpuhp_lock_release(bringup); lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int ret; if (!cpu_online(cpu)) return 0; cpuhp_lock_acquire(false); cpuhp_lock_release(false); cpuhp_lock_acquire(true); cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. */ if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->rollback = false; st->last = NULL; st->node = node; st->bringup = bringup; st->cb_state = state; st->single = true; __cpuhp_kick_ap(st); /* * If we failed and did a partial, do a rollback. */ if ((ret = st->result) && st->last) { st->rollback = true; st->bringup = !bringup; __cpuhp_kick_ap(st); } /* * Clean up the leftovers so the next hotplug operation wont use stale * data. */ st->node = st->last = NULL; return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state prev_state = st->state; int ret; cpuhp_lock_acquire(false); cpuhp_lock_release(false); cpuhp_lock_acquire(true); cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(cpu, st, st->target); trace_cpuhp_exit(cpu, st->state, prev_state, ret); return ret; } static struct smp_hotplug_thread cpuhp_threads = { .store = &cpuhp_state.thread, .thread_should_run = cpuhp_should_run, .thread_fn = cpuhp_thread_fun, .thread_comm = "cpuhp/%u", .selfparking = true, }; static __init void cpuhp_init_state(void) { struct cpuhp_cpu_state *st; int cpu; for_each_possible_cpu(cpu) { st = per_cpu_ptr(&cpuhp_state, cpu); init_completion(&st->done_up); init_completion(&st->done_down); } } void __init cpuhp_threads_init(void) { cpuhp_init_state(); BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); kthread_unpark(this_cpu_read(cpuhp_state.thread)); } #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) #endif /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id * * This function walks all processes, finds a valid mm struct for each one and * then clears a corresponding bit in mm's cpumask. While this all sounds * trivial, there are various non-obvious corner cases, which this function * tries to solve in a safe manner. * * Also note that the function uses a somewhat relaxed locking scheme, so it may * be called only for an already offlined CPU. */ void clear_tasks_mm_cpumask(int cpu) { struct task_struct *p; /* * This function is called after the cpu is taken down and marked * offline, so its not like new tasks will ever get this cpu set in * their mm mask. -- Peter Zijlstra * Thus, we may use rcu_read_lock() here, instead of grabbing * full-fledged tasklist_lock. */ WARN_ON(cpu_online(cpu)); rcu_read_lock(); for_each_process(p) { struct task_struct *t; /* * Main thread might exit, but other threads may still have * a valid mm. Find one. */ t = find_lock_task_mm(p); if (!t) continue; arch_clear_mm_cpumask_cpu(cpu, t->mm); task_unlock(t); } rcu_read_unlock(); } /* Take this CPU down. */ static int take_cpu_down(void *_param) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; /* * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going * down, that the current state is CPUHP_TEARDOWN_CPU - 1. */ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1)); /* * Invoke the former CPU_DYING callbacks. DYING must not fail! */ cpuhp_invoke_callback_range_nofail(false, cpu, st, target); /* Park the stopper thread */ stop_machine_park(cpu); return 0; } static int takedown_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; /* Park the smpboot threads */ kthread_park(st->thread); /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. */ irq_lock_sparse(); /* * So now all preempt/rcu users must observe !cpu_active(). */ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); /* Unpark the hotplug thread so we can rollback there */ kthread_unpark(st->thread); return err; } BUG_ON(cpu_online(cpu)); /* * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away. */ wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); cpuhp_bp_sync_dead(cpu); lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu)); /* * Callbacks must be re-integrated right away to the RCU state machine. * Otherwise an RCU callback could block a further teardown function * waiting for its completion. */ rcutree_migrate_callbacks(cpu); return 0; } static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); tick_assert_timekeeping_handover(); rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. */ smp_call_function_single(cpumask_first(cpu_online_mask), cpuhp_complete_idle_dead, st, 0); } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; ret = cpuhp_invoke_callback_range(false, cpu, st, target); if (ret) { pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n", ret, cpu, cpuhp_get_step(st->state)->name, st->state); cpuhp_reset_state(cpu, st, prev_state); if (st->state < prev_state) WARN_ON(cpuhp_invoke_callback_range(true, cpu, st, prev_state)); } return ret; } /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; if (num_online_cpus() == 1) return -EBUSY; if (!cpu_present(cpu)) return -EINVAL; cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; prev_state = cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just * return the error code.. */ if (ret) goto out; /* * We might have stopped still in the range of the AP hotplug * thread. Nothing to do anymore. */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state < prev_state) { if (st->state == CPUHP_TEARDOWN_CPU) { cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } else { WARN(1, "DEAD callback error for CPU%d", cpu); } } out: cpus_write_unlock(); arch_smt_update(); return ret; } struct cpu_down_work { unsigned int cpu; enum cpuhp_state target; }; static long __cpu_down_maps_locked(void *arg) { struct cpu_down_work *work = arg; return _cpu_down(work->cpu, 0, work->target); } static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { struct cpu_down_work work = { .cpu = cpu, .target = target, }; /* * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. */ if (cpu_hotplug_offline_disabled) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; /* * Ensure that the control task does not run on the to be offlined * CPU to prevent a deadlock against cfs_b->period_timer. * Also keep at least one housekeeping cpu onlined to avoid generating * an empty sched_domain span. */ for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) { if (cpu != work.cpu) return work_on_cpu(cpu, __cpu_down_maps_locked, &work); } return -EBUSY; } static int cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; cpu_maps_update_begin(); err = cpu_down_maps_locked(cpu, target); cpu_maps_update_done(); return err; } /** * cpu_device_down - Bring down a cpu device * @dev: Pointer to the cpu device to offline * * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use remove_cpu() instead. * * Return: %0 on success or a negative errno code */ int cpu_device_down(struct device *dev) { return cpu_down(dev->id, CPUHP_OFFLINE); } int remove_cpu(unsigned int cpu) { int ret; lock_device_hotplug(); ret = device_offline(get_cpu_device(cpu)); unlock_device_hotplug(); return ret; } EXPORT_SYMBOL_GPL(remove_cpu); void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { unsigned int cpu; int error; cpu_maps_update_begin(); /* * Make certain the cpu I'm about to reboot on is online. * * This is inline to what migrate_to_reboot_cpu() already do. */ if (!cpu_online(primary_cpu)) primary_cpu = cpumask_first(cpu_online_mask); for_each_online_cpu(cpu) { if (cpu == primary_cpu) continue; error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (error) { pr_err("Failed to offline CPU%d - error=%d", cpu, error); break; } } /* * Ensure all but the reboot CPU are offline. */ BUG_ON(num_online_cpus() > 1); /* * Make sure the CPUs won't be enabled by someone else after this * point. Kexec will reboot to a new kernel shortly resetting * everything along the way. */ cpu_hotplug_disabled++; cpu_maps_update_done(); } #else #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* * STARTING must not fail! */ cpuhp_invoke_callback_range_nofail(true, cpu, st, target); } /* * Called from the idle task. Wake up the controlling task which brings the * hotplug thread of the upcoming CPU up and then delegates the rest of the * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE); /* * Unpark the stopper thread before we start the idle loop (and start * scheduling); this ensures the stopper task is always available. */ stop_machine_unpark(smp_processor_id()); st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; int ret = 0; cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } /* * The caller of cpu_up() might have raced with another * caller. Nothing to do. */ if (st->state >= target) goto out; if (st->state == CPUHP_OFFLINE) { /* Let it fail before we try to bring the cpu up */ idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); goto out; } /* * Reset stale stack state from the last time this CPU was online. */ scs_task_reset(idle); kasan_unpoison_task_stack(idle); } cpuhp_tasks_frozen = tasks_frozen; cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. */ if (st->state > CPUHP_BRINGUP_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just * return the error code.. */ if (ret) goto out; } /* * Try to reach the target state. We max out on the BP at * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is * responsible for bringing it up to the target state. */ target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); arch_smt_update(); return ret; } static int cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; if (!cpu_possible(cpu)) { pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", cpu); return -EINVAL; } err = try_online_node(cpu_to_node(cpu)); if (err) return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { err = -EBUSY; goto out; } if (!cpu_bootable(cpu)) { err = -EPERM; goto out; } err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } /** * cpu_device_up - Bring up a cpu device * @dev: Pointer to the cpu device to online * * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use add_cpu() instead. * * Return: %0 on success or a negative errno code */ int cpu_device_up(struct device *dev) { return cpu_up(dev->id, CPUHP_ONLINE); } int add_cpu(unsigned int cpu) { int ret; lock_device_hotplug(); ret = device_online(get_cpu_device(cpu)); unlock_device_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_cpu); /** * bringup_hibernate_cpu - Bring up the CPU that we hibernated on * @sleep_cpu: The cpu we hibernated on and should be brought up. * * On some architectures like arm64, we can hibernate on any CPU, but on * wake up the CPU we hibernated on might be offline as a side effect of * using maxcpus= for example. * * Return: %0 on success or a negative errno code */ int bringup_hibernate_cpu(unsigned int sleep_cpu) { int ret; if (!cpu_online(sleep_cpu)) { pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); ret = cpu_up(sleep_cpu, CPUHP_ONLINE); if (ret) { pr_err("Failed to bring hibernate-CPU up!\n"); return ret; } } return 0; } static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus, enum cpuhp_state target) { unsigned int cpu; for_each_cpu(cpu, mask) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); if (cpu_up(cpu, target) && can_rollback_cpu(st)) { /* * If this failed then cpu_up() might have only * rolled back to CPUHP_BP_KICK_AP for the final * online. Clean it up. NOOP if already rolled back. */ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE)); } if (!--ncpus) break; } } #ifdef CONFIG_HOTPLUG_PARALLEL static bool __cpuhp_parallel_bringup __ro_after_init = true; static int __init parallel_bringup_parse_param(char *arg) { return kstrtobool(arg, &__cpuhp_parallel_bringup); } early_param("cpuhp.parallel", parallel_bringup_parse_param); #ifdef CONFIG_HOTPLUG_SMT static inline bool cpuhp_smt_aware(void) { return cpu_smt_max_threads > 1; } static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_primary_thread_mask; } #else static inline bool cpuhp_smt_aware(void) { return false; } static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_none_mask; } #endif bool __weak arch_cpuhp_init_parallel_bringup(void) { return true; } /* * On architectures which have enabled parallel bringup this invokes all BP * prepare states for each of the to be onlined APs first. The last state * sends the startup IPI to the APs. The APs proceed through the low level * bringup code in parallel and then wait for the control CPU to release * them one by one for the final onlining procedure. * * This avoids waiting for each AP to respond to the startup IPI in * CPUHP_BRINGUP_CPU. */ static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus) { const struct cpumask *mask = cpu_present_mask; if (__cpuhp_parallel_bringup) __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup(); if (!__cpuhp_parallel_bringup) return false; if (cpuhp_smt_aware()) { const struct cpumask *pmask = cpuhp_get_primary_thread_mask(); static struct cpumask tmp_mask __initdata; /* * X86 requires to prevent that SMT siblings stopped while * the primary thread does a microcode update for various * reasons. Bring the primary threads up first. */ cpumask_and(&tmp_mask, mask, pmask); cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP); cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE); /* Account for the online CPUs */ ncpus -= num_online_cpus(); if (!ncpus) return true; /* Create the mask for secondary CPUs */ cpumask_andnot(&tmp_mask, mask, pmask); mask = &tmp_mask; } /* Bring the not-yet started CPUs up */ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP); cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE); return true; } #else static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; } #endif /* CONFIG_HOTPLUG_PARALLEL */ void __init bringup_nonboot_cpus(unsigned int max_cpus) { if (!max_cpus) return; /* Try parallel bringup optimization if enabled */ if (cpuhp_bringup_cpus_parallel(max_cpus)) return; /* Full per CPU serialized bringup */ cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE); } #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; int freeze_secondary_cpus(int primary) { int cpu, error = 0; cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); } /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); pr_info("Disabling non-boot CPUs ...\n"); for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) { if (!cpu_online(cpu) || cpu == primary) continue; if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; } trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } if (!error) BUG_ON(num_online_cpus() > 1); else pr_err("Non-boot CPUs are not disabled\n"); /* * Make sure the CPUs won't be enabled by someone else. We need to do * this even in case of failure as all freeze_secondary_cpus() users are * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; cpu_maps_update_done(); return error; } void __weak arch_thaw_secondary_cpus_begin(void) { } void __weak arch_thaw_secondary_cpus_end(void) { } void thaw_secondary_cpus(void) { int cpu, error; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; pr_info("Enabling non-boot CPUs ...\n"); arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } static int __init alloc_frozen_cpus(void) { if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) return -ENOMEM; return 0; } core_initcall(alloc_frozen_cpus); /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the * duration* of the execution of the callbacks. * Hence we need to prevent the freezer from racing with regular CPU hotplug. * * This synchronization is implemented by mutually excluding regular CPU * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ * Hibernate notifications. */ static int cpu_hotplug_pm_callback(struct notifier_block *nb, unsigned long action, void *ptr) { switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: cpu_hotplug_enable(); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static int __init cpu_hotplug_pm_sync_init(void) { /* * cpu_hotplug_pm_callback has higher priority than x86 * bsp_pm_callback which depends on cpu_hotplug_pm_callback * to disable cpu hotplug to avoid cpu hotplug race. */ pm_notifier(cpu_hotplug_pm_callback, 0); return 0; } core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ int __boot_cpu_id; #endif /* CONFIG_SMP */ /* Boot processor state steps */ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", .startup.single = NULL, .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { .name = "threads:prepare", .startup.single = smpboot_create_threads, .teardown.single = NULL, .cant_stop = true, }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, .teardown.single = NULL, }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RELAY_PREPARE] = { .name = "relay:prepare", .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, .teardown.single = rcutree_dead_cpu, }, /* * On the tear-down path, timers_dead_cpu() must be invoked * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_PREPARE] = { .name = "timers:prepare", .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, #ifdef CONFIG_HOTPLUG_SPLIT_STARTUP /* * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until * the next step will release it. */ [CPUHP_BP_KICK_AP] = { .name = "cpu:kick_ap", .startup.single = cpuhp_kick_ap_alive, }, /* * Waits for the AP to reach cpuhp_ap_sync_alive() and then * releases it for the complete bringup. */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = cpuhp_bringup_ap, .teardown.single = finish_cpu, .cant_stop = true, }, #else /* * All-in-one CPU bringup state which includes the kick alive. */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, .teardown.single = finish_cpu, .cant_stop = true, }, #endif /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", }, /* * Last state before CPU enters the idle loop to die. Transient state * for synchronization. */ [CPUHP_AP_OFFLINE] = { .name = "ap:offline", .cant_stop = true, }, /* First state is scheduler control. Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", .startup.single = sched_cpu_starting, .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { .name = "RCU/tree:dying", .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, [CPUHP_AP_SMPCFD_DYING] = { .name = "smpcfd:dying", .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, [CPUHP_AP_HRTIMERS_DYING] = { .name = "hrtimers:dying", .startup.single = hrtimers_cpu_starting, .teardown.single = hrtimers_cpu_dying, }, [CPUHP_AP_TICK_DYING] = { .name = "tick:dying", .startup.single = NULL, .teardown.single = tick_cpu_dying, }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { .name = "ap:online", }, /* * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup.single = NULL, .teardown.single = takedown_cpu, .cant_stop = true, }, [CPUHP_AP_SCHED_WAIT_EMPTY] = { .name = "sched:waitempty", .startup.single = NULL, .teardown.single = sched_cpu_wait_empty, }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, .teardown.single = smpboot_park_threads, }, [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { .name = "irq/affinity:online", .startup.single = irq_affinity_online_cpu, .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { .name = "perf:online", .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WATCHDOG_ONLINE] = { .name = "lockup_detector:online", .startup.single = lockup_detector_online_cpu, .teardown.single = lockup_detector_offline_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RANDOM_ONLINE] = { .name = "random:online", .startup.single = random_online_cpu, .teardown.single = NULL, }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, .teardown.single = rcutree_offline_cpu, }, #endif /* * The dynamically registered state space is here */ #ifdef CONFIG_SMP /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", .startup.single = sched_cpu_activate, .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", .startup.single = NULL, .teardown.single = NULL, }, }; /* Sanity check for callbacks */ static int cpuhp_cb_check(enum cpuhp_state state) { if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) return -EINVAL; return 0; } /* * Returns a free for dynamic slot assignment of the Online state. The states * are protected by the cpuhp_slot_states mutex and an empty slot is identified * by having no name assigned. */ static int cpuhp_reserve_state(enum cpuhp_state state) { enum cpuhp_state i, end; struct cpuhp_step *step; switch (state) { case CPUHP_AP_ONLINE_DYN: step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; end = CPUHP_AP_ONLINE_DYN_END; break; case CPUHP_BP_PREPARE_DYN: step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; end = CPUHP_BP_PREPARE_DYN_END; break; default: return -EINVAL; } for (i = state; i <= end; i++, step++) { if (!step->name) return i; } WARN(1, "No more dynamic states available for CPU hotplug\n"); return -ENOSPC; } static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; int ret = 0; /* * If name is NULL, then the state gets removed. * * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on * the first allocation from these dynamic ranges, so the removal * would trigger a new allocation and clear the wrong (already * empty) state, leaving the callbacks of the to be cleared state * dangling, which causes wreckage on the next hotplug operation. */ if (name && (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN)) { ret = cpuhp_reserve_state(state); if (ret < 0) return ret; state = ret; } sp = cpuhp_get_step(state); if (name && sp->name) return -EBUSY; sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); return ret; } static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { return cpuhp_get_step(state)->teardown.single; } /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_step *sp = cpuhp_get_step(state); int ret; /* * If there's nothing to do, we done. * Relies on the union for multi_instance. */ if (cpuhp_step_empty(bringup, sp)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown * e.g. module removal we crash for now. */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; } /* * Called from __cpuhp_setup_state on a recoverable failure. * * Note: The teardown callbacks for rollback are not allowed to fail! */ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, struct hlist_node *node) { int cpu; /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpu >= failedcpu) break; /* Did we invoke the startup call on that cpu ? */ if (cpustate >= state) cpuhp_issue_call(cpu, state, false, node); } } int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, struct hlist_node *node, bool invoke) { struct cpuhp_step *sp; int cpu; int ret; lockdep_assert_cpus_held(); sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) goto add_node; /* * Try to call the startup callback for each present cpu * depending on the hotplug state of the cpu. */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate < state) continue; ret = cpuhp_issue_call(cpu, state, true, node); if (ret) { if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); goto unlock; } } add_node: ret = 0; hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); return ret; } int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { int ret; cpus_read_lock(); ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @name: Name of the step * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state * @startup: startup callback function * @teardown: teardown callback function * @multi_instance: State is set up for multiple instances which get * added afterwards. * * The caller needs to hold cpus read locked while calling this function. * Return: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN; * 0 for all other states * On failure: proper (negative) error code */ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { int cpu, ret = 0; bool dynstate; lockdep_assert_cpus_held(); if (cpuhp_cb_check(state) || !name) return -EINVAL; mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN; if (ret > 0 && dynstate) { state = ret; ret = 0; } if (ret || !invoke || !startup) goto out; /* * Try to call the startup callback for each present cpu * depending on the hotplug state of the cpu. */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate < state) continue; ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { if (teardown) cpuhp_rollback_install(cpu, state, NULL); cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } out: mutex_unlock(&cpuhp_state_mutex); /* * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, * return the dynamically allocated state in case of success. */ if (!ret && dynstate) return state; return ret; } EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { int ret; cpus_read_lock(); ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, teardown, multi_instance); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); if (!sp->multi_instance) return -EINVAL; cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* * Call the teardown callback for each present cpu depending * on the hotplug state of the cpu. This function is not * allowed to fail currently! */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate >= state) cpuhp_issue_call(cpu, state, false, node); } remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); cpus_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { WARN(!hlist_empty(&sp->list), "Error: Removing state %d which has instances left.\n", state); goto remove; } if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* * Call the teardown callback for each present cpu depending * on the hotplug state of the cpu. This function is not * allowed to fail currently! */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate >= state) cpuhp_issue_call(cpu, state, false, NULL); } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); } EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { cpus_read_lock(); __cpuhp_remove_state_cpuslocked(state, invoke); cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); #ifdef CONFIG_HOTPLUG_SMT static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); dev->offline = true; /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } static void cpuhp_online_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); dev->offline = false; /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_ONLINE); } int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; cpu_maps_update_begin(); for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; /* * Disable can be called with CPU_SMT_ENABLED when changing * from a higher to lower number of SMT threads per core. */ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; /* * As this needs to hold the cpu maps lock it's impossible * to call device_offline() because that ends up calling * cpu_down() which takes cpu maps lock. cpu maps lock * needs to be held as this might race against in kernel * abusers of the hotplug machinery (thermal management). * * So nothing would update device:offline state. That would * leave the sysfs entry stale and prevent onlining after * smt control has been changed to 'off' again. This is * called under the sysfs hotplug lock, so it is properly * serialized against the regular offline usage. */ cpuhp_offline_cpu_device(cpu); } if (!ret) cpu_smt_control = ctrlval; cpu_maps_update_done(); return ret; } /* Check if the core a CPU belongs to is online */ #if !defined(topology_is_core_online) static inline bool topology_is_core_online(unsigned int cpu) { return true; } #endif int cpuhp_smt_enable(void) { int cpu, ret = 0; cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; for_each_present_cpu(cpu) { /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; /* See comment in cpuhp_smt_disable() */ cpuhp_online_cpu_device(cpu); } cpu_maps_update_done(); return ret; } #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->state); } static DEVICE_ATTR_RO(state); static ssize_t target_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; int target, ret; ret = kstrtoint(buf, 10, &target); if (ret) return ret; #ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) return -EINVAL; #else if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) return -EINVAL; #endif ret = lock_device_hotplug_sysfs(); if (ret) return ret; mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(target); ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) goto out; if (st->state < target) ret = cpu_up(dev->id, target); else if (st->state > target) ret = cpu_down(dev->id, target); else if (WARN_ON(st->target != target)) st->target = target; out: unlock_device_hotplug(); return ret ? ret : count; } static ssize_t target_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->target); } static DEVICE_ATTR_RW(target); static ssize_t fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; int fail, ret; ret = kstrtoint(buf, 10, &fail); if (ret) return ret; if (fail == CPUHP_INVALID) { st->fail = fail; return count; } if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) return -EINVAL; /* * Cannot fail STARTING/DYING callbacks. */ if (cpuhp_is_atomic_state(fail)) return -EINVAL; /* * DEAD callbacks cannot fail... * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter * triggering STARTING callbacks, a failure in this state would * hinder rollback. */ if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU) return -EINVAL; /* * Cannot fail anything that doesn't have callbacks. */ mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(fail); if (!sp->startup.single && !sp->teardown.single) ret = -EINVAL; mutex_unlock(&cpuhp_state_mutex); if (ret) return ret; st->fail = fail; return count; } static ssize_t fail_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->fail); } static DEVICE_ATTR_RW(fail); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, &dev_attr_fail.attr, NULL }; static const struct attribute_group cpuhp_cpu_attr_group = { .attrs = cpuhp_cpu_attrs, .name = "hotplug", }; static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t cur, res = 0; int i; mutex_lock(&cpuhp_state_mutex); for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { struct cpuhp_step *sp = cpuhp_get_step(i); if (sp->name) { cur = sprintf(buf, "%3d: %s\n", i, sp->name); buf += cur; res += cur; } } mutex_unlock(&cpuhp_state_mutex); return res; } static DEVICE_ATTR_RO(states); static struct attribute *cpuhp_cpu_root_attrs[] = { &dev_attr_states.attr, NULL }; static const struct attribute_group cpuhp_cpu_root_attr_group = { .attrs = cpuhp_cpu_root_attrs, .name = "hotplug", }; #ifdef CONFIG_HOTPLUG_SMT static bool cpu_smt_num_threads_valid(unsigned int threads) { if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC)) return threads >= 1 && threads <= cpu_smt_max_threads; return threads == 1 || threads == cpu_smt_max_threads; } static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ctrlval, ret, num_threads, orig_threads; bool force_off; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; if (sysfs_streq(buf, "on")) { ctrlval = CPU_SMT_ENABLED; num_threads = cpu_smt_max_threads; } else if (sysfs_streq(buf, "off")) { ctrlval = CPU_SMT_DISABLED; num_threads = 1; } else if (sysfs_streq(buf, "forceoff")) { ctrlval = CPU_SMT_FORCE_DISABLED; num_threads = 1; } else if (kstrtoint(buf, 10, &num_threads) == 0) { if (num_threads == 1) ctrlval = CPU_SMT_DISABLED; else if (cpu_smt_num_threads_valid(num_threads)) ctrlval = CPU_SMT_ENABLED; else return -EINVAL; } else { return -EINVAL; } ret = lock_device_hotplug_sysfs(); if (ret) return ret; orig_threads = cpu_smt_num_threads; cpu_smt_num_threads = num_threads; force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED; if (num_threads > orig_threads) ret = cpuhp_smt_enable(); else if (num_threads < orig_threads || force_off) ret = cpuhp_smt_disable(ctrlval); unlock_device_hotplug(); return ret ? ret : count; } #else /* !CONFIG_HOTPLUG_SMT */ static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return -ENODEV; } #endif /* CONFIG_HOTPLUG_SMT */ static const char *smt_states[] = { [CPU_SMT_ENABLED] = "on", [CPU_SMT_DISABLED] = "off", [CPU_SMT_FORCE_DISABLED] = "forceoff", [CPU_SMT_NOT_SUPPORTED] = "notsupported", [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", }; static ssize_t control_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *state = smt_states[cpu_smt_control]; #ifdef CONFIG_HOTPLUG_SMT /* * If SMT is enabled but not all threads are enabled then show the * number of threads. If all threads are enabled show "on". Otherwise * show the state name. */ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_num_threads != cpu_smt_max_threads) return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); #endif return sysfs_emit(buf, "%s\n", state); } static ssize_t control_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return __store_smt_control(dev, attr, buf, count); } static DEVICE_ATTR_RW(control); static ssize_t active_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", sched_smt_active()); } static DEVICE_ATTR_RO(active); static struct attribute *cpuhp_smt_attrs[] = { &dev_attr_control.attr, &dev_attr_active.attr, NULL }; static const struct attribute_group cpuhp_smt_attr_group = { .attrs = cpuhp_smt_attrs, .name = "smt", }; static int __init cpu_smt_sysfs_init(void) { struct device *dev_root; int ret = -ENODEV; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group); put_device(dev_root); } return ret; } static int __init cpuhp_sysfs_init(void) { struct device *dev_root; int cpu, ret; ret = cpu_smt_sysfs_init(); if (ret) return ret; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group); put_device(dev_root); if (ret) return ret; } for_each_possible_cpu(cpu) { struct device *dev = get_cpu_device(cpu); if (!dev) continue; ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); if (ret) return ret; } return 0; } device_initcall(cpuhp_sysfs_init); #endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. * * It is used by cpumask_of() to get a constant address to a CPU * mask value that has a single bit set only. */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ #define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { MASK_DECLARE_8(0), MASK_DECLARE_8(8), MASK_DECLARE_8(16), MASK_DECLARE_8(24), #if BITS_PER_LONG > 32 MASK_DECLARE_8(32), MASK_DECLARE_8(40), MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; #else struct cpumask __cpu_possible_mask __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); struct cpumask __cpu_enabled_mask __read_mostly; EXPORT_SYMBOL(__cpu_enabled_mask); struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); struct cpumask __cpu_dying_mask __read_mostly; EXPORT_SYMBOL(__cpu_dying_mask); atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); } void set_cpu_online(unsigned int cpu, bool online) { /* * atomic_inc/dec() is required to handle the horrid abuse of this * function by the reboot and kexec code which invoke it from * IPI/NMI broadcasts when shutting down CPUs. Invocation from * regular CPU hotplug is properly serialized. * * Note, that the fact that __num_online_cpus is of type atomic_t * does not protect readers which are not serialized against * concurrent hotplug operations. */ if (online) { if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) atomic_inc(&__num_online_cpus); } else { if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) atomic_dec(&__num_online_cpus); } } /* * Activate the first processor. */ void __init boot_cpu_init(void) { int cpu = smp_processor_id(); /* Mark the boot cpu "present", "online" etc for SMP and UP case */ set_cpu_online(cpu, true); set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); #ifdef CONFIG_SMP __boot_cpu_id = cpu; #endif } /* * Must be called _AFTER_ setting up the per_cpu areas */ void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } #ifdef CONFIG_CPU_MITIGATIONS /* * These are used for a global "mitigations=" cmdline option for toggling * optional CPU mitigations. */ enum cpu_mitigations { CPU_MITIGATIONS_OFF, CPU_MITIGATIONS_AUTO, CPU_MITIGATIONS_AUTO_NOSMT, }; static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { if (!strcmp(arg, "off")) cpu_mitigations = CPU_MITIGATIONS_OFF; else if (!strcmp(arg, "auto")) cpu_mitigations = CPU_MITIGATIONS_AUTO; else if (!strcmp(arg, "auto,nosmt")) cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; else pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg); return 0; } /* mitigations=off */ bool cpu_mitigations_off(void) { return cpu_mitigations == CPU_MITIGATIONS_OFF; } EXPORT_SYMBOL_GPL(cpu_mitigations_off); /* mitigations=auto,nosmt */ bool cpu_mitigations_auto_nosmt(void) { return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; } EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); #else static int __init mitigations_parse_cmdline(char *arg) { pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n"); return 0; } #endif early_param("mitigations", mitigations_parse_cmdline); |
| 1 1 8 2 6 1 5 9 1 8 13 1 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | // SPDX-License-Identifier: GPL-2.0 /* * fs/isofs/export.c * * (C) 2004 Paul Serice - The new inode scheme requires switching * from iget() to iget5_locked() which means * the NFS export operations have to be hand * coded because the default routines rely on * iget(). * * The following files are helpful: * * Documentation/filesystems/nfs/exporting.rst * fs/exportfs/expfs.c. */ #include "isofs.h" static struct dentry * isofs_export_iget(struct super_block *sb, unsigned long block, unsigned long offset, __u32 generation) { struct inode *inode; if (block == 0) return ERR_PTR(-ESTALE); inode = isofs_iget(sb, block, offset); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return d_obtain_alias(inode); } /* This function is surprisingly simple. The trick is understanding * that "child" is always a directory. So, to find its parent, you * simply need to find its ".." entry, normalize its block and offset, * and return the underlying inode. See the comments for * isofs_normalize_block_and_offset(). */ static struct dentry *isofs_export_get_parent(struct dentry *child) { unsigned long parent_block = 0; unsigned long parent_offset = 0; struct inode *child_inode = d_inode(child); struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; struct dentry *rv = NULL; /* "child" must always be a directory. */ if (!S_ISDIR(child_inode->i_mode)) { printk(KERN_ERR "isofs: isofs_export_get_parent(): " "child is not a directory!\n"); rv = ERR_PTR(-EACCES); goto out; } /* It is an invariant that the directory offset is zero. If * it is not zero, it means the directory failed to be * normalized for some reason. */ if (e_child_inode->i_iget5_offset != 0) { printk(KERN_ERR "isofs: isofs_export_get_parent(): " "child directory not normalized!\n"); rv = ERR_PTR(-EACCES); goto out; } /* The child inode has been normalized such that its * i_iget5_block value points to the "." entry. Fortunately, * the ".." entry is located in the same block. */ parent_block = e_child_inode->i_iget5_block; /* Get the block in question. */ bh = sb_bread(child_inode->i_sb, parent_block); if (bh == NULL) { rv = ERR_PTR(-EACCES); goto out; } /* This is the "." entry. */ de = (struct iso_directory_record*)bh->b_data; /* The ".." entry is always the second entry. */ parent_offset = (unsigned long)isonum_711(de->length); de = (struct iso_directory_record*)(bh->b_data + parent_offset); /* Verify it is in fact the ".." entry. */ if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { printk(KERN_ERR "isofs: Unable to find the \"..\" " "directory for NFS.\n"); rv = ERR_PTR(-EACCES); goto out; } /* Normalize */ isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, parent_offset)); out: if (bh) brelse(bh); return rv; } static int isofs_export_encode_fh(struct inode *inode, __u32 *fh32, int *max_len, struct inode *parent) { struct iso_inode_info * ei = ISOFS_I(inode); int len = *max_len; int type = 1; __u16 *fh16 = (__u16*)fh32; /* * WARNING: max_len is 5 for NFSv2. Because of this * limitation, we use the lower 16 bits of fh32[1] to hold the * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ if (parent && (len < 5)) { *max_len = 5; return FILEID_INVALID; } else if (len < 3) { *max_len = 3; return FILEID_INVALID; } len = 3; fh32[0] = ei->i_iget5_block; fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ fh16[3] = 0; /* avoid leaking uninitialized data */ fh32[2] = inode->i_generation; if (parent) { struct iso_inode_info *eparent; eparent = ISOFS_I(parent); fh32[3] = eparent->i_iget5_block; fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ fh32[4] = parent->i_generation; len = 5; type = 2; } *max_len = len; return type; } struct isofs_fid { u32 block; u16 offset; u16 parent_offset; u32 generation; u32 parent_block; u32 parent_generation; }; static struct dentry *isofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct isofs_fid *ifid = (struct isofs_fid *)fid; if (fh_len < 3 || fh_type > 2) return NULL; return isofs_export_iget(sb, ifid->block, ifid->offset, ifid->generation); } static struct dentry *isofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct isofs_fid *ifid = (struct isofs_fid *)fid; if (fh_len < 2 || fh_type != 2) return NULL; return isofs_export_iget(sb, fh_len > 3 ? ifid->parent_block : 0, ifid->parent_offset, fh_len > 4 ? ifid->parent_generation : 0); } const struct export_operations isofs_export_ops = { .encode_fh = isofs_export_encode_fh, .fh_to_dentry = isofs_fh_to_dentry, .fh_to_parent = isofs_fh_to_parent, .get_parent = isofs_export_get_parent, }; |
| 23 17 23 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 | /* * linux/fs/nls/nls_cp775.c * * Charset cp775 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0106, 0x00fc, 0x00e9, 0x0101, 0x00e4, 0x0123, 0x00e5, 0x0107, 0x0142, 0x0113, 0x0156, 0x0157, 0x012b, 0x0179, 0x00c4, 0x00c5, /* 0x90*/ 0x00c9, 0x00e6, 0x00c6, 0x014d, 0x00f6, 0x0122, 0x00a2, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x00a4, /* 0xa0*/ 0x0100, 0x012a, 0x00f3, 0x017b, 0x017c, 0x017a, 0x201d, 0x00a6, 0x00a9, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x0141, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0104, 0x010c, 0x0118, 0x0116, 0x2563, 0x2551, 0x2557, 0x255d, 0x012e, 0x0160, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0172, 0x016a, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x017d, /* 0xd0*/ 0x0105, 0x010d, 0x0119, 0x0117, 0x012f, 0x0161, 0x0173, 0x016b, 0x017e, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* 0xe0*/ 0x00d3, 0x00df, 0x014c, 0x0143, 0x00f5, 0x00d5, 0x00b5, 0x0144, 0x0136, 0x0137, 0x013b, 0x013c, 0x0146, 0x0112, 0x0145, 0x2019, /* 0xf0*/ 0x00ad, 0x00b1, 0x201c, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x201e, 0x00b0, 0x2219, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */ 0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ 0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */ 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */ 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ 0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */ 0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */ 0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, page22, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */ 0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */ 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ 0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */ 0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */ 0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */ 0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */ 0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp775", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp775(void) { return register_nls(&table); } static void __exit exit_nls_cp775(void) { unregister_nls(&table); } module_init(init_nls_cp775) module_exit(exit_nls_cp775) MODULE_DESCRIPTION("NLS Codepage 775 (Baltic Rim)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 52 27 6 6 149 149 5 2 11 11 11 11 11 149 50 52 21 21 11 21 21 149 149 11 12 11 11 11 11 11 11 11 149 149 149 160 14 149 205 206 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2008 Dave Chinner * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log.h" #include "xfs_log_priv.h" #ifdef DEBUG /* * Check that the list is sorted as it should be. * * Called with the ail lock held, but we don't want to assert fail with it * held otherwise we'll lock everything up and won't be able to debug the * cause. Hence we sample and check the state under the AIL lock and return if * everything is fine, otherwise we drop the lock and run the ASSERT checks. * Asserts may not be fatal, so pick the lock back up and continue onwards. */ STATIC void xfs_ail_check( struct xfs_ail *ailp, struct xfs_log_item *lip) __must_hold(&ailp->ail_lock) { struct xfs_log_item *prev_lip; struct xfs_log_item *next_lip; xfs_lsn_t prev_lsn = NULLCOMMITLSN; xfs_lsn_t next_lsn = NULLCOMMITLSN; xfs_lsn_t lsn; bool in_ail; if (list_empty(&ailp->ail_head)) return; /* * Sample then check the next and previous entries are valid. */ in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags); prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail); if (&prev_lip->li_ail != &ailp->ail_head) prev_lsn = prev_lip->li_lsn; next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail); if (&next_lip->li_ail != &ailp->ail_head) next_lsn = next_lip->li_lsn; lsn = lip->li_lsn; if (in_ail && (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) && (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0)) return; spin_unlock(&ailp->ail_lock); ASSERT(in_ail); ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0); ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0); spin_lock(&ailp->ail_lock); } #else /* !DEBUG */ #define xfs_ail_check(a,l) #endif /* DEBUG */ /* * Return a pointer to the last item in the AIL. If the AIL is empty, then * return NULL. */ static struct xfs_log_item * xfs_ail_max( struct xfs_ail *ailp) { if (list_empty(&ailp->ail_head)) return NULL; return list_entry(ailp->ail_head.prev, struct xfs_log_item, li_ail); } /* * Return a pointer to the item which follows the given item in the AIL. If * the given item is the last item in the list, then return NULL. */ static struct xfs_log_item * xfs_ail_next( struct xfs_ail *ailp, struct xfs_log_item *lip) { if (lip->li_ail.next == &ailp->ail_head) return NULL; return list_first_entry(&lip->li_ail, struct xfs_log_item, li_ail); } /* * This is called by the log manager code to determine the LSN of the tail of * the log. This is exactly the LSN of the first item in the AIL. If the AIL * is empty, then this function returns 0. * * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. */ static xfs_lsn_t __xfs_ail_min_lsn( struct xfs_ail *ailp) { struct xfs_log_item *lip = xfs_ail_min(ailp); if (lip) return lip->li_lsn; return 0; } xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; } /* * The cursor keeps track of where our current traversal is up to by tracking * the next item in the list for us. However, for this to be safe, removing an * object from the AIL needs to invalidate any cursor that points to it. hence * the traversal cursor needs to be linked to the struct xfs_ail so that * deletion can search all the active cursors for invalidation. */ STATIC void xfs_trans_ail_cursor_init( struct xfs_ail *ailp, struct xfs_ail_cursor *cur) { cur->item = NULL; list_add_tail(&cur->list, &ailp->ail_cursors); } /* * Get the next item in the traversal and advance the cursor. If the cursor * was invalidated (indicated by a lip of 1), restart the traversal. */ struct xfs_log_item * xfs_trans_ail_cursor_next( struct xfs_ail *ailp, struct xfs_ail_cursor *cur) { struct xfs_log_item *lip = cur->item; if ((uintptr_t)lip & 1) lip = xfs_ail_min(ailp); if (lip) cur->item = xfs_ail_next(ailp, lip); return lip; } /* * When the traversal is complete, we need to remove the cursor from the list * of traversing cursors. */ void xfs_trans_ail_cursor_done( struct xfs_ail_cursor *cur) { cur->item = NULL; list_del_init(&cur->list); } /* * Invalidate any cursor that is pointing to this item. This is called when an * item is removed from the AIL. Any cursor pointing to this object is now * invalid and the traversal needs to be terminated so it doesn't reference a * freed object. We set the low bit of the cursor item pointer so we can * distinguish between an invalidation and the end of the list when getting the * next item from the cursor. */ STATIC void xfs_trans_ail_cursor_clear( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_ail_cursor *cur; list_for_each_entry(cur, &ailp->ail_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) ((uintptr_t)cur->item | 1); } } /* * Find the first item in the AIL with the given @lsn by searching in ascending * LSN order and initialise the cursor to point to the next item for a * ascending traversal. Pass a @lsn of zero to initialise the cursor to the * first item in the AIL. Returns NULL if the list is empty. */ struct xfs_log_item * xfs_trans_ail_cursor_first( struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn) { struct xfs_log_item *lip; xfs_trans_ail_cursor_init(ailp, cur); if (lsn == 0) { lip = xfs_ail_min(ailp); goto out; } list_for_each_entry(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) goto out; } return NULL; out: if (lip) cur->item = xfs_ail_next(ailp, lip); return lip; } static struct xfs_log_item * __xfs_trans_ail_cursor_last( struct xfs_ail *ailp, xfs_lsn_t lsn) { struct xfs_log_item *lip; list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) return lip; } return NULL; } /* * Find the last item in the AIL with the given @lsn by searching in descending * LSN order and initialise the cursor to point to that item. If there is no * item with the value of @lsn, then it sets the cursor to the last item with an * LSN lower than @lsn. Returns NULL if the list is empty. */ struct xfs_log_item * xfs_trans_ail_cursor_last( struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn) { xfs_trans_ail_cursor_init(ailp, cur); cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); return cur->item; } /* * Splice the log item list into the AIL at the given LSN. We splice to the * tail of the given LSN to maintain insert order for push traversals. The * cursor is optional, allowing repeated updates to the same LSN to avoid * repeated traversals. This should not be called with an empty list. */ static void xfs_ail_splice( struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct list_head *list, xfs_lsn_t lsn) { struct xfs_log_item *lip; ASSERT(!list_empty(list)); /* * Use the cursor to determine the insertion point if one is * provided. If not, or if the one we got is not valid, * find the place in the AIL where the items belong. */ lip = cur ? cur->item : NULL; if (!lip || (uintptr_t)lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* * If a cursor is provided, we know we're processing the AIL * in lsn order, and future items to be spliced in will * follow the last one being inserted now. Update the * cursor to point to that last item, now while we have a * reliable pointer to it. */ if (cur) cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); /* * Finally perform the splice. Unless the AIL was empty, * lip points to the item in the AIL _after_ which the new * items should go. If lip is null the AIL was empty, so * the new items go at the head of the AIL. */ if (lip) list_splice(list, &lip->li_ail); else list_splice(list, &ailp->ail_head); } /* * Delete the given item from the AIL. */ static void xfs_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip) { xfs_ail_check(ailp, lip); list_del(&lip->li_ail); xfs_trans_ail_cursor_clear(ailp, lip); } /* * Requeue a failed buffer for writeback. * * We clear the log item failed state here as well, but we have to be careful * about reference counts because the only active reference counts on the buffer * may be the failed log items. Hence if we clear the log item failed state * before queuing the buffer for IO we can release all active references to * the buffer and free it, leading to use after free problems in * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which * order we process them in - the buffer is locked, and we own the buffer list * so nothing on them is going to change while we are performing this action. * * Hence we can safely queue the buffer for IO before we clear the failed log * item state, therefore always having an active reference to the buffer and * avoiding the transient zero-reference state that leads to use-after-free. */ static inline int xfsaild_resubmit_item( struct xfs_log_item *lip, struct list_head *buffer_list) { struct xfs_buf *bp = lip->li_buf; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; if (!xfs_buf_delwri_queue(bp, buffer_list)) { xfs_buf_unlock(bp); return XFS_ITEM_FLUSHING; } /* protected by ail_lock */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) clear_bit(XFS_LI_FAILED, &lip->li_flags); xfs_buf_unlock(bp); return XFS_ITEM_SUCCESS; } static inline uint xfsaild_push_item( struct xfs_ail *ailp, struct xfs_log_item *lip) { /* * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* * Consider the item pinned if a push callback is not defined so the * caller will force the log. This should only happen for intent items * as they are unpinned once the associated done item is committed to * the on-disk log. */ if (!lip->li_ops->iop_push) return XFS_ITEM_PINNED; if (test_bit(XFS_LI_FAILED, &lip->li_flags)) return xfsaild_resubmit_item(lip, &ailp->ail_buf_list); return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } /* * Compute the LSN that we'd need to push the log tail towards in order to have * at least 25% of the log space free. If the log free space already meets this * threshold, this function returns the lowest LSN in the AIL to slowly keep * writeback ticking over and the tail of the log moving forward. */ static xfs_lsn_t xfs_ail_calc_push_target( struct xfs_ail *ailp) { struct xlog *log = ailp->ail_log; struct xfs_log_item *lip; xfs_lsn_t target_lsn; xfs_lsn_t max_lsn; xfs_lsn_t min_lsn; int32_t free_bytes; uint32_t target_block; uint32_t target_cycle; lockdep_assert_held(&ailp->ail_lock); lip = xfs_ail_max(ailp); if (!lip) return NULLCOMMITLSN; max_lsn = lip->li_lsn; min_lsn = __xfs_ail_min_lsn(ailp); /* * If we are supposed to push all the items in the AIL, we want to push * to the current head. We then clear the push flag so that we don't * keep pushing newly queued items beyond where the push all command was * run. If the push waiter wants to empty the ail, it should queue * itself on the ail_empty wait queue. */ if (test_and_clear_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate)) return max_lsn; /* If someone wants the AIL empty, keep pushing everything we have. */ if (waitqueue_active(&ailp->ail_empty)) return max_lsn; /* * Background pushing - attempt to keep 25% of the log free and if we * have that much free retain the existing target. */ free_bytes = log->l_logsize - xlog_lsn_sub(log, max_lsn, min_lsn); if (free_bytes >= log->l_logsize >> 2) return ailp->ail_target; target_cycle = CYCLE_LSN(min_lsn); target_block = BLOCK_LSN(min_lsn) + (log->l_logBBsize >> 2); if (target_block >= log->l_logBBsize) { target_block -= log->l_logBBsize; target_cycle += 1; } target_lsn = xlog_assign_lsn(target_cycle, target_block); /* Cap the target to the highest LSN known to be in the AIL. */ if (XFS_LSN_CMP(target_lsn, max_lsn) > 0) return max_lsn; /* If the existing target is higher than the new target, keep it. */ if (XFS_LSN_CMP(ailp->ail_target, target_lsn) >= 0) return ailp->ail_target; return target_lsn; } static long xfsaild_push( struct xfs_ail *ailp) { struct xfs_mount *mp = ailp->ail_log->l_mp; struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; long tout; int stuck = 0; int flushing = 0; int count = 0; /* * If we encountered pinned items or did not finish writing out all * buffers the last time we ran, force a background CIL push to get the * items unpinned in the near future. We do not wait on the CIL push as * that could stall us for seconds if there is enough background IO * load. Stalling for that long when the tail of the log is pinned and * needs flushing will hard stop the transaction subsystem when log * space runs out. */ if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 && (!list_empty_careful(&ailp->ail_buf_list) || xfs_ail_min_lsn(ailp))) { ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); xlog_cil_flush(ailp->ail_log); } spin_lock(&ailp->ail_lock); WRITE_ONCE(ailp->ail_target, xfs_ail_calc_push_target(ailp)); if (ailp->ail_target == NULLCOMMITLSN) goto out_done; /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); if (!lip) goto out_done_cursor; XFS_STATS_INC(mp, xs_push_ail); ASSERT(ailp->ail_target != NULLCOMMITLSN); lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) { int lock_result; if (test_bit(XFS_LI_FLUSHING, &lip->li_flags)) goto next_item; /* * Note that iop_push may unlock and reacquire the AIL lock. We * rely on the AIL cursor implementation to be able to deal with * the dropped lock. */ lock_result = xfsaild_push_item(ailp, lip); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(mp, xs_push_ail_success); trace_xfs_ail_push(lip); ailp->ail_last_pushed_lsn = lsn; break; case XFS_ITEM_FLUSHING: /* * The item or its backing buffer is already being * flushed. The typical reason for that is that an * inode buffer is locked because we already pushed the * updates to it as part of inode clustering. * * We do not want to stop flushing just because lots * of items are already being flushed, but we need to * re-try the flushing relatively soon if most of the * AIL is being flushed. */ XFS_STATS_INC(mp, xs_push_ail_flushing); trace_xfs_ail_flushing(lip); flushing++; ailp->ail_last_pushed_lsn = lsn; break; case XFS_ITEM_PINNED: XFS_STATS_INC(mp, xs_push_ail_pinned); trace_xfs_ail_pinned(lip); stuck++; ailp->ail_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(mp, xs_push_ail_locked); trace_xfs_ail_locked(lip); stuck++; break; default: ASSERT(0); break; } count++; /* * Are there too many items we can't do anything with? * * If we are skipping too many items because we can't flush * them or they are already being flushed, we back off and * given them time to complete whatever operation is being * done. i.e. remove pressure from the AIL while we can't make * progress so traversals don't slow down further inserts and * removals to/from the AIL. * * The value of 100 is an arbitrary magic number based on * observation. */ if (stuck > 100) break; next_item: lip = xfs_trans_ail_cursor_next(ailp, &cur); if (lip == NULL) break; if (lip->li_lsn != lsn && count > 1000) break; lsn = lip->li_lsn; } out_done_cursor: xfs_trans_ail_cursor_done(&cur); out_done: spin_unlock(&ailp->ail_lock); if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list)) ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, ailp->ail_target) >= 0) { /* * We reached the target or the AIL is empty, so wait a bit * longer for I/O to complete and remove pushed items from the * AIL before we start the next scan from the start of the AIL. */ tout = 50; ailp->ail_last_pushed_lsn = 0; } else if (((stuck + flushing) * 100) / count > 90) { /* * Either there is a lot of contention on the AIL or we are * stuck due to operations in progress. "Stuck" in this case * is defined as >90% of the items we tried to push were stuck. * * Backoff a bit more to allow some I/O to complete before * restarting from the start of the AIL. This prevents us from * spinning on the same items, and if they are pinned will all * the restart to issue a log force to unpin the stuck items. */ tout = 20; ailp->ail_last_pushed_lsn = 0; } else { /* * Assume we have more work to do in a short while. */ tout = 0; } return tout; } static int xfsaild( void *data) { struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ unsigned int noreclaim_flag; noreclaim_flag = memalloc_noreclaim_save(); set_freezable(); while (1) { /* * Long waits of 50ms or more occur when we've run out of items * to push, so we only want uninterruptible state if we're * actually blocked on something. */ if (tout && tout <= 20) set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); /* * Check kthread_should_stop() after we set the task state to * guarantee that we either see the stop bit and exit or the * task state is reset to runnable such that it's not scheduled * out indefinitely and detects the stop bit at next iteration. * A memory barrier is included in above task state set to * serialize again kthread_stop(). */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); /* * The caller forces out the AIL before stopping the * thread in the common case, which means the delwri * queue is drained. In the shutdown case, the queue may * still hold relogged buffers that haven't been * submitted because they were pinned since added to the * queue. * * Log I/O error processing stales the underlying buffer * and clears the delwri state, expecting the buf to be * removed on the next submission attempt. That won't * happen if we're shutting down, so this is the last * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || xlog_is_shutdown(ailp->ail_log)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } /* Idle if the AIL is empty. */ spin_lock(&ailp->ail_lock); if (!xfs_ail_min(ailp) && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); schedule(); tout = 0; continue; } spin_unlock(&ailp->ail_lock); if (tout) schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); try_to_freeze(); tout = xfsaild_push(ailp); } memalloc_noreclaim_restore(noreclaim_flag); return 0; } /* * Push out all items in the AIL immediately and wait until the AIL is empty. */ void xfs_ail_push_all_sync( struct xfs_ail *ailp) { DEFINE_WAIT(wait); spin_lock(&ailp->ail_lock); while (xfs_ail_max(ailp) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); schedule(); spin_lock(&ailp->ail_lock); } spin_unlock(&ailp->ail_lock); finish_wait(&ailp->ail_empty, &wait); } void __xfs_ail_assign_tail_lsn( struct xfs_ail *ailp) { struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; assert_spin_locked(&ailp->ail_lock); if (xlog_is_shutdown(log)) return; tail_lsn = __xfs_ail_min_lsn(ailp); if (!tail_lsn) tail_lsn = ailp->ail_head_lsn; WRITE_ONCE(log->l_tail_space, xlog_lsn_sub(log, ailp->ail_head_lsn, tail_lsn)); trace_xfs_log_assign_tail_lsn(log, tail_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); } /* * Callers should pass the original tail lsn so that we can detect if the tail * has moved as a result of the operation that was performed. If the caller * needs to force a tail space update, it should pass NULLCOMMITLSN to bypass * the "did the tail LSN change?" checks. If the caller wants to avoid a tail * update (e.g. it knows the tail did not change) it should pass an @old_lsn of * 0. */ void xfs_ail_update_finish( struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock) { struct xlog *log = ailp->ail_log; /* If the tail lsn hasn't changed, don't do updates or wakeups. */ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { spin_unlock(&ailp->ail_lock); return; } __xfs_ail_assign_tail_lsn(ailp); if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); spin_unlock(&ailp->ail_lock); xfs_log_space_wake(log->l_mp); } /* * xfs_trans_ail_update_bulk - bulk AIL insertion operation. * * @xfs_trans_ail_update_bulk takes an array of log items that all need to be * positioned at the same LSN in the AIL. If an item is not in the AIL, it will * be added. Otherwise, it will be repositioned by removing it and re-adding * it to the AIL. * * If we move the first item in the AIL, update the log tail to match the new * minimum LSN in the AIL. * * This function should be called with the AIL lock held. * * To optimise the insert operation, we add all items to a temporary list, then * splice this list into the correct position in the AIL. * * Items that are already in the AIL are first deleted from their current * location before being added to the temporary list. * * This avoids needing to do an insert operation on every item. * * The AIL lock is dropped by xfs_ail_update_finish() before returning to * the caller. */ void xfs_trans_ail_update_bulk( struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); ASSERT(nr_items > 0); /* Not required, but true. */ mlip = xfs_ail_min(ailp); for (i = 0; i < nr_items; i++) { struct xfs_log_item *lip = log_items[i]; if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) { /* check if we really need to move the item */ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); if (mlip == lip && !tail_lsn) tail_lsn = lip->li_lsn; xfs_ail_delete(ailp, lip); } else { trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; list_add_tail(&lip->li_ail, &tmp); } if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); /* * If this is the first insert, wake up the push daemon so it can * actively scan for items to push. We also need to do a log tail * LSN update to ensure that it is correctly tracked by the log, so * set the tail_lsn to NULLCOMMITLSN so that xfs_ail_update_finish() * will see that the tail lsn has changed and will update the tail * appropriately. */ if (!mlip) { wake_up_process(ailp->ail_task); tail_lsn = NULLCOMMITLSN; } xfs_ail_update_finish(ailp, tail_lsn); } /* Insert a log item into the AIL. */ void xfs_trans_ail_insert( struct xfs_ail *ailp, struct xfs_log_item *lip, xfs_lsn_t lsn) { spin_lock(&ailp->ail_lock); xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } /* * Delete one log item from the AIL. * * If this item was at the tail of the AIL, return the LSN of the log item so * that we can use it to check if the LSN of the tail of the log has moved * when finishing up the AIL delete process in xfs_ail_update_finish(). */ xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; if (mlip == lip) return lsn; return 0; } void xfs_trans_ail_delete( struct xfs_log_item *lip, int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); if (shutdown_type && !xlog_is_shutdown(log)) { xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); xlog_force_shutdown(log, shutdown_type); } return; } clear_bit(XFS_LI_FAILED, &lip->li_flags); tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */ } int xfs_trans_ail_init( xfs_mount_t *mp) { struct xfs_ail *ailp; ailp = kzalloc(sizeof(struct xfs_ail), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!ailp) return -ENOMEM; ailp->ail_log = mp->m_log; INIT_LIST_HEAD(&ailp->ail_head); INIT_LIST_HEAD(&ailp->ail_cursors); spin_lock_init(&ailp->ail_lock); INIT_LIST_HEAD(&ailp->ail_buf_list); init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", mp->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; mp->m_ail = ailp; return 0; out_free_ailp: kfree(ailp); return -ENOMEM; } void xfs_trans_ail_destroy( xfs_mount_t *mp) { struct xfs_ail *ailp = mp->m_ail; kthread_stop(ailp->ail_task); kfree(ailp); } |
| 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | // SPDX-License-Identifier: GPL-2.0-only /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. * * Authors: Artem Bityutskiy (Битюцкий Артём) * Adrian Hunter */ /* * This file implements UBIFS shrinker which evicts clean znodes from the TNC * tree when Linux VM needs more RAM. * * We do not implement any LRU lists to find oldest znodes to free because it * would add additional overhead to the file system fast paths. So the shrinker * just walks the TNC tree when searching for znodes to free. * * If the root of a TNC sub-tree is clean and old enough, then the children are * also clean and old enough. So the shrinker walks the TNC in level order and * dumps entire sub-trees. * * The age of znodes is just the time-stamp when they were last looked at. * The current shrinker first tries to evict old znodes, then young ones. * * Since the shrinker is global, it has to protect against races with FS * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. */ #include "ubifs.h" /* List of all UBIFS file-system instances */ LIST_HEAD(ubifs_infos); /* * We number each shrinker run and record the number on the ubifs_info structure * so that we can easily work out which ubifs_info structures have already been * done by the current run. */ static unsigned int shrinker_run_no; /* Protects 'ubifs_infos' list */ DEFINE_SPINLOCK(ubifs_infos_lock); /* Global clean znode counter (for all mounted UBIFS instances) */ atomic_long_t ubifs_clean_zn_cnt; /** * shrink_tnc - shrink TNC tree. * @c: UBIFS file-system description object * @nr: number of znodes to free * @age: the age of znodes to free * @contention: if any contention, this is set to %1 * * This function traverses TNC tree and frees clean znodes. It does not free * clean znodes which younger then @age. Returns number of freed znodes. */ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) { int total_freed = 0; struct ubifs_znode *znode, *zprev; time64_t time = ktime_get_seconds(); ubifs_assert(c, mutex_is_locked(&c->umount_mutex)); ubifs_assert(c, mutex_is_locked(&c->tnc_mutex)); if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) return 0; /* * Traverse the TNC tree in levelorder manner, so that it is possible * to destroy large sub-trees. Indeed, if a znode is old, then all its * children are older or of the same age. * * Note, we are holding 'c->tnc_mutex', so we do not have to lock the * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is * changed only when the 'c->tnc_mutex' is held. */ zprev = NULL; znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL); while (znode && total_freed < nr && atomic_long_read(&c->clean_zn_cnt) > 0) { int freed; /* * If the znode is clean, but it is in the 'c->cnext' list, this * means that this znode has just been written to flash as a * part of commit and was marked clean. They will be removed * from the list at end commit. We cannot change the list, * because it is not protected by any mutex (design decision to * make commit really independent and parallel to main I/O). So * we just skip these znodes. * * Note, the 'clean_zn_cnt' counters are not updated until * after the commit, so the UBIFS shrinker does not report * the znodes which are in the 'c->cnext' list as freeable. * * Also note, if the root of a sub-tree is not in 'c->cnext', * then the whole sub-tree is not in 'c->cnext' as well, so it * is safe to dump whole sub-tree. */ if (znode->cnext) { /* * Very soon these znodes will be removed from the list * and become freeable. */ *contention = 1; } else if (!ubifs_zn_dirty(znode) && abs(time - znode->time) >= age) { if (znode->parent) znode->parent->zbranch[znode->iip].znode = NULL; else c->zroot.znode = NULL; freed = ubifs_destroy_tnc_subtree(c, znode); atomic_long_sub(freed, &ubifs_clean_zn_cnt); atomic_long_sub(freed, &c->clean_zn_cnt); total_freed += freed; znode = zprev; } if (unlikely(!c->zroot.znode)) break; zprev = znode; znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode); cond_resched(); } return total_freed; } /** * shrink_tnc_trees - shrink UBIFS TNC trees. * @nr: number of znodes to free * @age: the age of znodes to free * @contention: if any contention, this is set to %1 * * This function walks the list of mounted UBIFS file-systems and frees clean * znodes which are older than @age, until at least @nr znodes are freed. * Returns the number of freed znodes. */ static int shrink_tnc_trees(int nr, int age, int *contention) { struct ubifs_info *c; struct list_head *p; unsigned int run_no; int freed = 0; spin_lock(&ubifs_infos_lock); do { run_no = ++shrinker_run_no; } while (run_no == 0); /* Iterate over all mounted UBIFS file-systems and try to shrink them */ p = ubifs_infos.next; while (p != &ubifs_infos) { c = list_entry(p, struct ubifs_info, infos_list); /* * We move the ones we do to the end of the list, so we stop * when we see one we have already done. */ if (c->shrinker_run_no == run_no) break; if (!mutex_trylock(&c->umount_mutex)) { /* Some un-mount is in progress, try next FS */ *contention = 1; p = p->next; continue; } /* * We're holding 'c->umount_mutex', so the file-system won't go * away. */ if (!mutex_trylock(&c->tnc_mutex)) { mutex_unlock(&c->umount_mutex); *contention = 1; p = p->next; continue; } spin_unlock(&ubifs_infos_lock); /* * OK, now we have TNC locked, the file-system cannot go away - * it is safe to reap the cache. */ c->shrinker_run_no = run_no; freed += shrink_tnc(c, nr, age, contention); mutex_unlock(&c->tnc_mutex); spin_lock(&ubifs_infos_lock); /* Get the next list element before we move this one */ p = p->next; /* * Move this one to the end of the list to provide some * fairness. */ list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; } spin_unlock(&ubifs_infos_lock); return freed; } /** * kick_a_thread - kick a background thread to start commit. * * This function kicks a background thread to start background commit. Returns * %-1 if a thread was kicked or there is another reason to assume the memory * will soon be freed or become freeable. If there are no dirty znodes, returns * %0. */ static int kick_a_thread(void) { int i; struct ubifs_info *c; /* * Iterate over all mounted UBIFS file-systems and find out if there is * already an ongoing commit operation there. If no, then iterate for * the second time and initiate background commit. */ spin_lock(&ubifs_infos_lock); for (i = 0; i < 2; i++) { list_for_each_entry(c, &ubifs_infos, infos_list) { long dirty_zn_cnt; if (!mutex_trylock(&c->umount_mutex)) { /* * Some un-mount is in progress, it will * certainly free memory, so just return. */ spin_unlock(&ubifs_infos_lock); return -1; } dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || c->ro_mount || c->ro_error) { mutex_unlock(&c->umount_mutex); continue; } if (c->cmt_state != COMMIT_RESTING) { spin_unlock(&ubifs_infos_lock); mutex_unlock(&c->umount_mutex); return -1; } if (i == 1) { list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); mutex_unlock(&c->umount_mutex); return -1; } mutex_unlock(&c->umount_mutex); } } spin_unlock(&ubifs_infos_lock); return 0; } unsigned long ubifs_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); /* * Due to the way UBIFS updates the clean znode counter it may * temporarily be negative. */ return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; } unsigned long ubifs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { unsigned long nr = sc->nr_to_scan; int contention = 0; unsigned long freed; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (!clean_zn_cnt) { /* * No clean znodes, nothing to reap. All we can do in this case * is to kick background threads to start commit, which will * probably make clean znodes which, in turn, will be freeable. * And we return -1 which means will make VM call us again * later. */ dbg_tnc("no clean znodes, kick a thread"); return kick_a_thread(); } freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); if (freed >= nr) goto out; dbg_tnc("not enough old znodes, try to free young ones"); freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); if (freed >= nr) goto out; dbg_tnc("not enough young znodes, free all"); freed += shrink_tnc_trees(nr - freed, 0, &contention); if (!freed && contention) { dbg_tnc("freed nothing, but contention"); return SHRINK_STOP; } out: dbg_tnc("%lu znodes were freed, requested %lu", freed, nr); return freed; } |
| 1164 1088 4 4 4 2 2 11 11 1371 1377 1376 1375 1105 1108 1107 1105 1106 6955 1086 6931 196 196 198 9 9 9 9 196 198 196 196 195 198 850 853 852 856 852 857 857 853 180 821 851 850 851 6219 6210 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, affinity); raw_spin_unlock_irqrestore(&p->pi_lock, flags); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for inet_sock * * Authors: Many, reorganised here by * Arnaldo Carvalho de Melo <acme@mandriva.com> */ #ifndef _INET_SOCK_H #define _INET_SOCK_H #include <linux/bitops.h> #include <linux/string.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/netdevice.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> #include <net/tcp_states.h> #include <net/l3mdev.h> /** struct ip_options - IP Options * * @faddr - Saved first hop address * @nexthop - Saved nexthop address in LSRR and SSRR * @is_strictroute - Strict source route * @srr_is_hit - Packet destination addr was our one * @is_changed - IP checksum more not valid * @rr_needaddr - Need to record addr of outgoing dev * @ts_needtime - Need to record timestamp * @ts_needaddr - Need to record addr of outgoing dev */ struct ip_options { __be32 faddr; __be32 nexthop; unsigned char optlen; unsigned char srr; unsigned char rr; unsigned char ts; unsigned char is_strictroute:1, srr_is_hit:1, is_changed:1, rr_needaddr:1, ts_needtime:1, ts_needaddr:1; unsigned char router_alert; unsigned char cipso; unsigned char __pad2; unsigned char __data[]; }; struct ip_options_rcu { struct rcu_head rcu; struct ip_options opt; }; struct ip_options_data { struct ip_options_rcu opt; char data[40]; }; struct inet_request_sock { struct request_sock req; #define ir_loc_addr req.__req_common.skc_rcv_saddr #define ir_rmt_addr req.__req_common.skc_daddr #define ir_num req.__req_common.skc_num #define ir_rmt_port req.__req_common.skc_dport #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr #define ir_iif req.__req_common.skc_bound_dev_if #define ir_cookie req.__req_common.skc_cookie #define ireq_net req.__req_common.skc_net #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, wscale_ok : 1, ecn_ok : 1, acked : 1, no_srccheck: 1, smc_ok : 1; u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; #if IS_ENABLED(CONFIG_IPV6) struct { struct ipv6_txoptions *ipv6_opt; struct sk_buff *pktopts; }; #endif }; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) { return (struct inet_request_sock *)sk; } static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { u32 mark = READ_ONCE(sk->sk_mark); if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; return mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, struct sk_buff *skb) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif return bound_dev_if; } static inline int inet_sk_bound_l3mdev(const struct sock *sk) { #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif return 0; } static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, int dif, int sdif) { if (!bound_dev_if) return !sdif || l3mdev_accept; return bound_dev_if == dif || bound_dev_if == sdif; } static inline bool inet_sk_bound_dev_eq(const struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); #endif } struct inet_cork { unsigned int flags; __be32 addr; struct ip_options *opt; unsigned int fragsize; int length; /* Total length of all frames */ struct dst_entry *dst; u8 tx_flags; __u8 ttl; __s16 tos; u32 priority; __u16 gso_size; u32 ts_opt_id; u64 transmit_time; u32 mark; }; struct inet_cork_full { struct inet_cork base; struct flowi fl; }; struct ip_mc_socklist; struct ipv6_pinfo; struct rtable; /** struct inet_sock - representation of INET sockets * * @sk - ancestor class * @pinet6 - pointer to IPv6 control block * @inet_daddr - Foreign IPv4 addr * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port * @inet_flags - various atomic flags * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array * @cork - info to build ip hdr on each ip frag while socket is corked */ struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; #endif /* Socket demultiplex comparisons on incoming packets. */ #define inet_daddr sk.__sk_common.skc_daddr #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num unsigned long inet_flags; __be32 inet_saddr; __s16 uc_ttl; __be16 inet_sport; struct ip_options_rcu __rcu *inet_opt; atomic_t inet_id; __u8 tos; __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; u32 local_port_range; /* high << 16 | low */ struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ enum { INET_FLAGS_PKTINFO = 0, INET_FLAGS_TTL = 1, INET_FLAGS_TOS = 2, INET_FLAGS_RECVOPTS = 3, INET_FLAGS_RETOPTS = 4, INET_FLAGS_PASSSEC = 5, INET_FLAGS_ORIGDSTADDR = 6, INET_FLAGS_CHECKSUM = 7, INET_FLAGS_RECVFRAGSIZE = 8, INET_FLAGS_RECVERR = 9, INET_FLAGS_RECVERR_RFC4884 = 10, INET_FLAGS_FREEBIND = 11, INET_FLAGS_HDRINCL = 12, INET_FLAGS_MC_LOOP = 13, INET_FLAGS_MC_ALL = 14, INET_FLAGS_TRANSPARENT = 15, INET_FLAGS_IS_ICSK = 16, INET_FLAGS_NODEFRAG = 17, INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, INET_FLAGS_DEFER_CONNECT = 19, INET_FLAGS_MC6_LOOP = 20, INET_FLAGS_RECVERR6_RFC4884 = 21, INET_FLAGS_MC6_ALL = 22, INET_FLAGS_AUTOFLOWLABEL_SET = 23, INET_FLAGS_AUTOFLOWLABEL = 24, INET_FLAGS_DONTFRAG = 25, INET_FLAGS_RECVERR6 = 26, INET_FLAGS_REPFLOW = 27, INET_FLAGS_RTALERT_ISOLATE = 28, INET_FLAGS_SNDFLOW = 29, INET_FLAGS_RTALERT = 30, }; /* cmsg flags for inet */ #define IP_CMSG_PKTINFO BIT(INET_FLAGS_PKTINFO) #define IP_CMSG_TTL BIT(INET_FLAGS_TTL) #define IP_CMSG_TOS BIT(INET_FLAGS_TOS) #define IP_CMSG_RECVOPTS BIT(INET_FLAGS_RECVOPTS) #define IP_CMSG_RETOPTS BIT(INET_FLAGS_RETOPTS) #define IP_CMSG_PASSSEC BIT(INET_FLAGS_PASSSEC) #define IP_CMSG_ORIGDSTADDR BIT(INET_FLAGS_ORIGDSTADDR) #define IP_CMSG_CHECKSUM BIT(INET_FLAGS_CHECKSUM) #define IP_CMSG_RECVFRAGSIZE BIT(INET_FLAGS_RECVFRAGSIZE) #define IP_CMSG_ALL (IP_CMSG_PKTINFO | IP_CMSG_TTL | \ IP_CMSG_TOS | IP_CMSG_RECVOPTS | \ IP_CMSG_RETOPTS | IP_CMSG_PASSSEC | \ IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM | \ IP_CMSG_RECVFRAGSIZE) static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) { return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; } static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) { return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); } #define inet_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_set_bit(nr, sk) \ set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_clear_bit(nr, sk) \ clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket * * SYNACK messages might be attached to request sockets. * Some places want to reach the listener in this case. */ static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) sk = NULL; #endif return sk; } /* sk_to_full_sk() variant with a const argument */ static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) sk = NULL; #endif return sk; } static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) { return sk_to_full_sk(skb->sk); } #define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk) static inline void __inet_sk_copy_descendant(struct sock *sk_to, const struct sock *sk_from, const int ancestor_size) { memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, sk_from->sk_prot->obj_size - ancestor_size); } int inet_sk_rebuild_header(struct sock *sk); /** * inet_sk_state_load - read sk->sk_state for lockless contexts * @sk: socket pointer * * Paired with inet_sk_state_store(). Used in places we don't hold socket lock: * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... */ static inline int inet_sk_state_load(const struct sock *sk) { /* state change might impact lockless readers. */ return smp_load_acquire(&sk->sk_state); } /** * inet_sk_state_store - update sk->sk_state * @sk: socket pointer * @newstate: new state * * Paired with inet_sk_state_load(). Should be used in contexts where * state change might impact lockless readers. */ void inet_sk_state_store(struct sock *sk, int newstate); void inet_sk_set_state(struct sock *sk, int state); static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport, u32 initval) { return jhash_3words((__force __u32) laddr, (__force __u32) faddr, ((__u32) lport) << 16 | (__force __u32)fport, initval); } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk)) flags |= FLOWI_FLAG_ANYSRC; return flags; } static inline void inet_inc_convert_csum(struct sock *sk) { inet_sk(sk)->convert_csum++; } static inline void inet_dec_convert_csum(struct sock *sk) { if (inet_sk(sk)->convert_csum > 0) inet_sk(sk)->convert_csum--; } static inline bool inet_get_convert_csum(struct sock *sk) { return !!inet_sk(sk)->convert_csum; } static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } static inline bool inet_addr_valid_or_nonlocal(struct net *net, struct inet_sock *inet, __be32 addr, int addr_type) { return inet_can_nonlocal_bind(net, inet) || addr == htonl(INADDR_ANY) || addr_type == RTN_LOCAL || addr_type == RTN_MULTICAST || addr_type == RTN_BROADCAST; } #endif /* _INET_SOCK_H */ |
| 14 12 14 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | /* * linux/fs/nls/nls_iso8859-3.c * * Charset iso8859-3 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, /* 0xb0*/ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, /* 0xc0*/ 0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xd0*/ 0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, /* 0xe0*/ 0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xf0*/ 0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0xa3, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ 0xb0, 0x00, 0xb2, 0xb3, 0xb4, 0xb5, 0x00, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0x00, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0x00, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0xc6, 0xe6, 0xc5, 0xe5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xab, 0xbb, /* 0x18-0x1f */ 0xd5, 0xf5, 0x00, 0x00, 0xa6, 0xb6, 0xa1, 0xb1, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xa9, 0xb9, 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xaa, 0xba, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0xdd, 0xfd, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0xaf, 0xbf, 0x00, 0x00, 0x00, /* 0x78-0x7f */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xa2, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xb1, 0xa2, 0xa3, 0xa4, 0x00, 0xb6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x69, 0xba, 0xbb, 0xbc, 0xad, 0x00, 0xbf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0x00, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xa1, 0xb2, 0xb3, 0xb4, 0x00, 0xa6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0x49, 0xaa, 0xab, 0xac, 0xbd, 0x00, 0xaf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-3", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_3(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_3(void) { unregister_nls(&table); } module_init(init_nls_iso8859_3) module_exit(exit_nls_iso8859_3) MODULE_DESCRIPTION("NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/crc32c.h> #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/inet.h> #include <linux/kthread.h> #include <linux/net.h> #include <linux/nsproxy.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> #ifdef CONFIG_BLOCK #include <linux/bio.h> #endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> #include <trace/events/sock.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> #include <linux/export.h> /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable * delivery. We tolerate TCP disconnects by reconnecting (with * exponential backoff) in the case of a fault (disconnection, bad * crc, protocol error). Acks allow sent messages to be discarded by * the sender. */ /* * We track the state of the socket on a given connection using * values defined below. The transition to a new socket state is * handled by a function which verifies we aren't coming from an * unexpected state. * * -------- * | NEW* | transient initial state * -------- * | con_sock_state_init() * v * ---------- * | CLOSED | initialized, but no socket (and no * ---------- TCP connection) * ^ \ * | \ con_sock_state_connecting() * | ---------------------- * | \ * + con_sock_state_closed() \ * |+--------------------------- \ * | \ \ \ * | ----------- \ \ * | | CLOSING | socket event; \ \ * | ----------- await close \ \ * | ^ \ | * | | \ | * | + con_sock_state_closing() \ | * | / \ | | * | / --------------- | | * | / \ v v * | / -------------- * | / -----------------| CONNECTING | socket created, TCP * | | / -------------- connect initiated * | | | con_sock_state_connected() * | | v * ------------- * | CONNECTED | TCP connection established * ------------- * * State values for ceph_connection->sock_state; NEW is assumed to be 0. */ #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ #define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { case CEPH_CON_F_LOSSYTX: case CEPH_CON_F_KEEPALIVE_PENDING: case CEPH_CON_F_WRITE_PENDING: case CEPH_CON_F_SOCK_CLOSED: case CEPH_CON_F_BACKOFF: return true; default: return false; } } void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } bool ceph_con_flag_test_and_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } bool ceph_con_flag_test_and_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_set_bit(con_flag, &con->flags); } /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* * Nicely render a sockaddr as a string. An array of formatted * strings is used, to approximate reentrancy. */ #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { int i; char *s; struct sockaddr_storage ss = addr->in_addr; /* align */ struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; switch (ss.ss_family) { case AF_INET: snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu", le32_to_cpu(addr->type), &in4->sin_addr, ntohs(in4->sin_port)); break; case AF_INET6: snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu", le32_to_cpu(addr->type), &in6->sin6_addr, ntohs(in6->sin6_port)); break; default: snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", ss.ss_family); } return s; } EXPORT_SYMBOL(ceph_pr_addr); void ceph_encode_my_addr(struct ceph_messenger *msgr) { if (!ceph_msgr2(from_msgr(msgr))) { memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); ceph_encode_banner_addr(&msgr->my_enc_addr); } } /* * work queue for all reading and writing to/from the socket. */ static struct workqueue_struct *ceph_msgr_wq; static int ceph_msgr_slab_init(void) { BUG_ON(ceph_msg_cache); ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); if (!ceph_msg_cache) return -ENOMEM; return 0; } static void ceph_msgr_slab_exit(void) { BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; } static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); ceph_msgr_wq = NULL; } BUG_ON(!ceph_zero_page); put_page(ceph_zero_page); ceph_zero_page = NULL; ceph_msgr_slab_exit(); } int __init ceph_msgr_init(void) { if (ceph_msgr_slab_init()) return -ENOMEM; BUG_ON(ceph_zero_page); ceph_zero_page = ZERO_PAGE(0); get_page(ceph_zero_page); /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. */ ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); if (ceph_msgr_wq) return 0; pr_err("msgr_init failed to create workqueue\n"); _ceph_msgr_exit(); return -ENOMEM; } void ceph_msgr_exit(void) { BUG_ON(ceph_msgr_wq == NULL); _ceph_msgr_exit(); } void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } EXPORT_SYMBOL(ceph_msgr_flush); /* Connection socket state transition functions */ static void con_sock_state_init(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } static void con_sock_state_connecting(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTING); } static void con_sock_state_connected(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTED); } static void con_sock_state_closing(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSING); } static void con_sock_state_closed(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING && old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; trace_sk_data_ready(sk); if (atomic_read(&con->msgr->stopping)) { return; } if (sk->sk_state != TCP_CLOSE_WAIT) { dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); fallthrough; case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: dout("%s TCP_ESTABLISHED\n", __func__); con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ break; } } /* * set up socket callbacks */ static void set_sock_callbacks(struct socket *sock, struct ceph_connection *con) { struct sock *sk = sock->sk; sk->sk_user_data = con; sk->sk_data_ready = ceph_sock_data_ready; sk->sk_write_space = ceph_sock_write_space; sk->sk_state_change = ceph_sock_state_change; } /* * socket helpers */ /* * initiate connection to a remote socket. */ int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; dout("%s con %p peer_addr %s\n", __func__, con, ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); memalloc_noio_restore(noio_flag); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; sock->sk->sk_use_task_frag = false; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); #endif set_sock_callbacks(sock, con); con_sock_state_connecting(con); ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr), sock->sk->sk_state); } else if (ret < 0) { pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr), ret); sock_release(sock); return ret; } if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; } /* * Shutdown/close the socket for the given connection. */ int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); con->sock = NULL; } /* * Forcibly clear the SOCK_CLOSED flag. It gets set * independent of the connection mutex, and we could have * received a socket close event before we had the chance to * shut the socket down. */ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); ceph_con_close_socket(con); if (con->in_msg) { WARN_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } if (con->out_msg) { WARN_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } if (con->bounce_page) { __free_page(con->bounce_page); con->bounce_page = NULL; } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); else ceph_con_v1_reset_protocol(con); } /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. */ static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, list_head); ceph_msg_remove(msg); } } void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); WARN_ON(con->in_msg); WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); con->out_seq = 0; con->in_seq = 0; con->in_seq_acked = 0; if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_session(con); else ceph_con_v1_reset_session(con); } /* * mark a peer down. drop any open connections. */ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); cancel_con(con); mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, __u8 entity_type, __u64 entity_num, struct ceph_entity_addr *addr) { mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); WARN_ON(con->state != CEPH_CON_S_CLOSED); con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened */ bool ceph_con_opened(struct ceph_connection *con) { if (ceph_msgr2(from_msgr(con->msgr))) return ceph_con_v2_opened(con); return ceph_con_v1_opened(con); } /* * initialize a new connection. */ void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); con->private = private; con->ops = ops; con->msgr = msgr; con_sock_state_init(con); mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; spin_lock(&msgr->global_seq_lock); if (msgr->global_seq < gt) msgr->global_seq = gt; ret = ++msgr->global_seq; spin_unlock(&msgr->global_seq_lock); return ret; } /* * Discard messages that have been acked by the server. */ void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { struct ceph_msg *msg; u64 seq; dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); while (!list_empty(&con->out_sent)) { msg = list_first_entry(&con->out_sent, struct ceph_msg, list_head); WARN_ON(msg->needs_out_seq); seq = le64_to_cpu(msg->hdr.seq); if (seq > ack_seq) break; dout("%s con %p discarding msg %p seq %llu\n", __func__, con, msg, seq); ceph_msg_remove(msg); } } /* * Discard messages that have been requeued in con_fault(), up to * reconnect_seq. This avoids gratuitously resending messages that * the server had received and handled prior to reconnect. */ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { struct ceph_msg *msg; u64 seq; dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); while (!list_empty(&con->out_queue)) { msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); if (msg->needs_out_seq) break; seq = le64_to_cpu(msg->hdr.seq); if (seq > reconnect_seq) break; dout("%s con %p discarding msg %p seq %llu\n", __func__, con, msg, seq); ceph_msg_remove(msg); } } #ifdef CONFIG_BLOCK /* * For a bio data item, a piece is whatever remains of the next * entry in the current bio iovec, or the first entry in the next * bio in the list. */ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_bio_iter *it = &cursor->bio_iter; cursor->resid = min_t(size_t, length, data->bio_length); *it = data->bio_pos; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, cursor->bio_iter.iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_bio_iter *it = &cursor->bio_iter; struct page *page = bio_iter_page(it->bio, it->iter); BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); cursor->resid -= bytes; bio_advance_iter(it->bio, &it->iter, bytes); if (!cursor->resid) return false; /* no more data */ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && page == bio_iter_page(it->bio, it->iter))) return false; /* more bytes to process in this segment */ if (!it->iter.bi_size) { it->bio = it->bio->bi_next; it->iter = it->bio->bi_iter; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; } BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); return true; } #endif /* CONFIG_BLOCK */ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct bio_vec *bvecs = data->bvec_pos.bvecs; cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); cursor->bvec_iter = data->bvec_pos.iter; cursor->bvec_iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); } static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, cursor->bvec_iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter); BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); cursor->resid -= bytes; bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); if (!cursor->resid) return false; /* no more data */ if (!bytes || (cursor->bvec_iter.bi_bvec_done && page == bvec_iter_page(bvecs, cursor->bvec_iter))) return false; /* more bytes to process in this segment */ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); return true; } /* * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. */ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; int page_count; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(!data->pages); BUG_ON(!data->length); cursor->resid = min(length, data->length); page_count = calc_pages_for(data->alignment, (u64)data->length); cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); } static struct page * ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_index >= cursor->page_count); BUG_ON(cursor->page_offset >= PAGE_SIZE); *page_offset = cursor->page_offset; *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return data->pages[cursor->page_index]; } static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); /* Advance the cursor page offset */ cursor->resid -= bytes; cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_index++; return true; } /* * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. */ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; struct page *page; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); if (!length) return; /* pagelist can be assigned but empty */ BUG_ON(list_empty(&pagelist->head)); page = list_first_entry(&pagelist->head, struct page, lru); cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; } static struct page * ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(!cursor->page); BUG_ON(cursor->offset + cursor->resid != pagelist->length); /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return cursor->page; } static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(cursor->offset + cursor->resid != pagelist->length); BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); /* Advance the cursor offset */ cursor->resid -= bytes; cursor->offset += bytes; /* offset of first page in pagelist is always 0 */ if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_next_entry(cursor->page, lru); return true; } static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; cursor->iov_iter = data->iter; cursor->lastlen = 0; iov_iter_truncate(&cursor->iov_iter, length); cursor->resid = iov_iter_count(&cursor->iov_iter); } static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct page *page; ssize_t len; if (cursor->lastlen) iov_iter_revert(&cursor->iov_iter, cursor->lastlen); len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE, 1, page_offset); BUG_ON(len < 0); cursor->lastlen = len; /* * FIXME: The assumption is that the pages represented by the iov_iter * are pinned, with the references held by the upper-level * callers, or by virtue of being under writeback. Eventually, * we'll get an iov_iter_get_pages2 variant that doesn't take * page refs. Until then, just put the page ref. */ VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page); put_page(page); *length = min_t(size_t, len, cursor->resid); return page; } static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(bytes > cursor->resid); cursor->resid -= bytes; if (bytes < cursor->lastlen) { cursor->lastlen -= bytes; } else { iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen); cursor->lastlen = 0; } return cursor->resid; } /* * Message data is handled (sent or received) in pieces, where each * piece resides on a single page. The network layer might not * consume an entire piece at once. A data item's cursor keeps * track of which piece is next to process and how much remains to * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. */ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) { size_t length = cursor->total_resid; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(cursor, length); break; case CEPH_MSG_DATA_PAGES: ceph_msg_data_pages_cursor_init(cursor, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: ceph_msg_data_bvecs_cursor_init(cursor, length); break; case CEPH_MSG_DATA_ITER: ceph_msg_data_iter_cursor_init(cursor, length); break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ break; } cursor->need_crc = true; } void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, struct ceph_msg *msg, size_t length) { BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); cursor->total_resid = length; cursor->data = msg->data; cursor->sr_resid = 0; __ceph_msg_data_cursor_init(cursor); } /* * Return the page containing the next piece to process for a given * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct page *page; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: page = ceph_msg_data_pagelist_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_PAGES: page = ceph_msg_data_pages_next(cursor, page_offset, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: page = ceph_msg_data_bvecs_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_ITER: page = ceph_msg_data_iter_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_NONE: default: page = NULL; break; } BUG_ON(!page); BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); BUG_ON(*length > cursor->resid); return page; } /* * Returns true if the result moves the cursor on to the next piece * of the data item. */ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; BUG_ON(bytes > cursor->resid); switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); break; case CEPH_MSG_DATA_PAGES: new_piece = ceph_msg_data_pages_advance(cursor, bytes); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); break; case CEPH_MSG_DATA_ITER: new_piece = ceph_msg_data_iter_advance(cursor, bytes); break; case CEPH_MSG_DATA_NONE: default: BUG(); break; } cursor->total_resid -= bytes; if (!cursor->resid && cursor->total_resid) { cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; } cursor->need_crc = new_piece; } u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) { char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); crc = crc32c(crc, kaddr + page_offset, length); kunmap(page); return crc; } bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; switch (ss.ss_family) { case AF_INET: return addr4->s_addr == htonl(INADDR_ANY); case AF_INET6: return ipv6_addr_any(addr6); default: return true; } } EXPORT_SYMBOL(ceph_addr_is_blank); int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); case AF_INET6: return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); } return 0; } void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); break; case AF_INET6: put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); break; } } /* * Unlike other *_pton function semantics, zero indicates success. */ static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, char delim, const char **ipend) { memset(&addr->in_addr, 0, sizeof(addr->in_addr)); if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { put_unaligned(AF_INET, &addr->in_addr.ss_family); return 0; } if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { put_unaligned(AF_INET6, &addr->in_addr.ss_family); return 0; } return -EINVAL; } /* * Extract hostname string and resolve using kernel DNS facility. */ #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER static int ceph_dns_resolve_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { const char *end, *delim_p; char *colon_p, *ip_addr = NULL; int ip_len, ret; /* * The end of the hostname occurs immediately preceding the delimiter or * the port marker (':') where the delimiter takes precedence. */ delim_p = memchr(name, delim, namelen); colon_p = memchr(name, ':', namelen); if (delim_p && colon_p) end = min(delim_p, colon_p); else if (!delim_p && colon_p) end = colon_p; else { end = delim_p; if (!end) /* case: hostname:/ */ end = name + namelen; } if (end <= name) return -EINVAL; /* do dns_resolve upcall */ ip_len = dns_query(current->nsproxy->net_ns, NULL, name, end - name, NULL, &ip_addr, NULL, false); if (ip_len > 0) ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); else ret = -ESRCH; kfree(ip_addr); *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, ret, ret ? "failed" : ceph_pr_addr(addr)); return ret; } #else static inline int ceph_dns_resolve_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { return -EINVAL; } #endif /* * Parse a server name (IP or hostname). If a valid IP address is not found * then try to extract a hostname to resolve using userspace DNS upcall. */ static int ceph_parse_server_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { int ret; ret = ceph_pton(name, namelen, addr, delim, ipend); if (ret) ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); return ret; } /* * Parse an ip[:port] list into an addr array. Use the default * monitor port if a port isn't specified. */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { char cur_delim = delim; const char *ipend; int port; if (*p == '[') { cur_delim = ']'; p++; } ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; } p++; } /* port? */ if (p < end && *p == ':') { port = 0; p++; while (p < end && *p >= '0' && *p <= '9') { port = (port * 10) + (*p - '0'); p++; } if (port == 0) port = CEPH_MON_PORT; else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; } ceph_addr_set_port(&addr[i], port); /* * We want the type to be set according to ms_mode * option, but options are normally parsed after mon * addresses. Rather than complicating parsing, set * to LEGACY and override in build_initial_monmap() * for mon addresses and ceph_messenger_init() for * ip option. */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; if (*p != delim) goto bad; p++; } if (p != end) goto bad; if (count) *count = i + 1; return 0; bad: return ret; } /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. */ void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; BUG_ON(con->in_msg->con != con); con->in_msg = NULL; /* if first message, set peer_name */ if (con->peer_name.type == 0) con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); mutex_lock(&con->mutex); } /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. * Returns 0 if work was queued, or an error code otherwise. */ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { dout("%s %p ref count 0\n", __func__, con); return -ENOENT; } if (delay >= HZ) delay = round_jiffies_relative(delay); dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); return -EBUSY; } return 0; } static void queue_con(struct ceph_connection *con) { (void) queue_con_delay(con, 0); } static void cancel_con(struct ceph_connection *con) { if (cancel_delayed_work(&con->work)) { dout("%s %p\n", __func__, con); con->ops->put(con); } } static bool con_sock_closed(struct ceph_connection *con) { if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); CASE(V1_BANNER); CASE(V1_CONNECT_MSG); CASE(V2_BANNER_PREFIX); CASE(V2_BANNER_PAYLOAD); CASE(V2_HELLO); CASE(V2_AUTH); CASE(V2_AUTH_SIGNATURE); CASE(V2_SESSION_CONNECT); CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: BUG(); } #undef CASE return true; } static bool con_backoff(struct ceph_connection *con) { int ret; if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; } /* Finish fault handling; con->mutex must *not* be held here */ static void con_fault_finish(struct ceph_connection *con) { dout("%s %p\n", __func__, con); /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ if (con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); con->v1.auth_retry = 0; } if (con->ops->fault) con->ops->fault(con); } /* * Do some work on a connection. Drop a connection ref when we're done. */ static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); bool fault; mutex_lock(&con->mutex); while (true) { int ret; if ((fault = con_sock_closed(con))) { dout("%s: con %p SOCK_CLOSED\n", __func__, con); break; } if (con_backoff(con)) { dout("%s: con %p BACKOFF\n", __func__, con); break; } if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } if (ceph_msgr2(from_msgr(con->msgr))) ret = ceph_con_v2_try_read(con); else ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on read"; fault = true; break; } if (ceph_msgr2(from_msgr(con->msgr))) ret = ceph_con_v2_try_write(con); else ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on write"; fault = true; } break; /* If we make it to here, we're done */ } if (fault) con_fault(con); mutex_unlock(&con->mutex); if (fault) con_fault_finish(con); con->ops->put(con); } /* * Generic error/fault handler. A retry mechanism is used with * exponential backoff */ static void con_fault(struct ceph_connection *con) { dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; WARN_ON(con->state == CEPH_CON_S_STANDBY || con->state == CEPH_CON_S_CLOSED); ceph_con_reset_protocol(con); if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; } /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ con->state = CEPH_CON_S_PREOPEN; if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); ceph_encode_my_addr(msgr); } /* * initialize a new messenger instance */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr) { spin_lock_init(&msgr->global_seq_lock); if (myaddr) { memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, sizeof(msgr->inst.addr.in_addr)); ceph_addr_set_port(&msgr->inst.addr, 0); } /* * Since nautilus, clients are identified using type ANY. * For msgr1, ceph_encode_banner_addr() munges it to NONE. */ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; /* generate a random non-zero nonce */ do { get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); } while (!msgr->inst.addr.nonce); ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { if (msg->con) msg->con->ops->put(msg->con); msg->con = con ? con->ops->get(con) : NULL; BUG_ON(msg->con != con); } static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. * * Consumes a ref on @msg. */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { /* set src+dst */ msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); msg->needs_out_seq = true; mutex_lock(&con->mutex); if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); return; } msg_con_set(msg, con); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ void ceph_msg_revoke(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (list_empty(&msg->list_head)) { WARN_ON(con->out_msg == msg); dout("%s con %p msg %p not linked\n", __func__, con, msg); mutex_unlock(&con->mutex); return; } dout("%s con %p msg %p was linked\n", __func__, con, msg); msg->hdr.seq = 0; ceph_msg_remove(msg); if (con->out_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_revoke(con); else ceph_con_v1_revoke(con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { dout("%s con %p msg %p not current, out_msg %p\n", __func__, con, msg, con->out_msg); } mutex_unlock(&con->mutex); } /* * Revoke a message that we may be reading data into */ void ceph_msg_revoke_incoming(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (con->in_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was recving\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_revoke_incoming(con); else ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } else { dout("%s con %p msg %p not current, in_msg %p\n", __func__, con, msg, con->in_msg); } mutex_unlock(&con->mutex); } /* * Queue a keepalive byte to ensure the tcp connection is alive. */ void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); bool ceph_con_keepalive_expired(struct ceph_connection *con, unsigned long interval) { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { struct timespec64 now; struct timespec64 ts; ktime_get_real_ts64(&now); jiffies_to_timespec64(interval, &ts); ts = timespec64_add(con->last_keepalive_ack, ts); return timespec64_compare(&now, &ts) >= 0; } return false; } static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) { BUG_ON(msg->num_data_items >= msg->max_data_items); return &msg->data[msg->num_data_items++]; } static void ceph_msg_data_destroy(struct ceph_msg_data *data) { if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { int num_pages = calc_pages_for(data->alignment, data->length); ceph_release_page_vector(data->pages, num_pages); } else if (data->type == CEPH_MSG_DATA_PAGELIST) { ceph_pagelist_release(data->pagelist); } } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment, bool own_pages) { struct ceph_msg_data *data; BUG_ON(!pages); BUG_ON(!length); data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_PAGES; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; data->own_pages = own_pages; msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_pages); void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { struct ceph_msg_data *data; BUG_ON(!pagelist); BUG_ON(!pagelist->length); data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_PAGELIST; refcount_inc(&pagelist->refcnt); data->pagelist = pagelist; msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, u32 length) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_BIO; data->bio_pos = *bio_pos; data->bio_length = length; msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, struct ceph_bvec_iter *bvec_pos) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_BVECS; data->bvec_pos = *bvec_pos; msg->data_length += bvec_pos->iter.bi_size; } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); void ceph_msg_data_add_iter(struct ceph_msg *msg, struct iov_iter *iter) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_ITER; data->iter = *iter; msg->data_length += iov_iter_count(&data->iter); } /* * construct a new message with given type, size * the new msg has a ref count of 1. */ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, gfp_t flags, bool can_fail) { struct ceph_msg *m; m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); m->hdr.front_len = cpu_to_le32(front_len); INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); /* front */ if (front_len) { m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); goto out2; } } else { m->front.iov_base = NULL; } m->front_alloc_len = m->front.iov_len = front_len; if (max_data_items) { m->data = kmalloc_array(max_data_items, sizeof(*m->data), flags); if (!m->data) goto out2; m->max_data_items = max_data_items; } dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: if (!can_fail) { pr_err("msg_new can't create type %d front %d\n", type, front_len); WARN_ON(1); } else { dout("msg_new can't create type %d front %d\n", type, front_len); } return NULL; } EXPORT_SYMBOL(ceph_msg_new2); struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail) { return ceph_msg_new2(type, front_len, 0, flags, can_fail); } EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't * allocated by alloc_msg. This allows us to read a small fixed-size * per-type header in the front and then gracefully fail (i.e., * propagate the error to the caller based on info in the front) when * the middle is too large. */ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); int middle_len = le32_to_cpu(msg->hdr.middle_len); dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, ceph_msg_type_name(type), middle_len); BUG_ON(!middle_len); BUG_ON(msg->middle); msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); if (!msg->middle) return -ENOMEM; return 0; } /* * Allocate a message for receiving an incoming message on a * connection, and save the result in con->in_msg. Uses the * connection's private alloc_msg op if available. * * Returns 0 on success, or a negative error code. * * On success, if we set *skip = 1: * - the next message should be skipped and ignored. * - con->in_msg == NULL * or if we set *skip = 0: * - con->in_msg is non-null. * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); BUG_ON(!con->ops->alloc_msg); mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; } if (msg) { BUG_ON(*skip); msg_con_set(msg, con); con->in_msg = msg; } else { /* * Null message pointer means either we should skip * this message or we couldn't allocate memory. The * former is not an error. */ if (*skip) return 0; con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { ceph_msg_put(con->in_msg); con->in_msg = NULL; } } return ret; } void ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; BUG_ON(list_empty(&con->out_queue)); msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); WARN_ON(msg->con != con); /* * Put the message on "sent" list using a ref from ceph_con_send(). * It is put when the message is acked or revoked. */ list_move_tail(&msg->list_head, &con->out_sent); /* * Only assign outgoing seq # if we haven't sent this message * yet. If it is requeued, resend with it's original seq. */ if (msg->needs_out_seq) { msg->hdr.seq = cpu_to_le64(++con->out_seq); msg->needs_out_seq = false; if (con->ops->reencode_message) con->ops->reencode_message(msg); } /* * Get a ref for out_msg. It is put when we are done sending the * message or in case of a fault. */ WARN_ON(con->out_msg); con->out_msg = ceph_msg_get(msg); } /* * Free a generically kmalloc'd message. */ static void ceph_msg_free(struct ceph_msg *m) { dout("%s %p\n", __func__, m); kvfree(m->front.iov_base); kfree(m->data); kmem_cache_free(ceph_msg_cache, m); } static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); int i; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); msg_con_set(m, NULL); /* drop middle, data, if any */ if (m->middle) { ceph_buffer_put(m->middle); m->middle = NULL; } for (i = 0; i < m->num_data_items; i++) ceph_msg_data_destroy(&m->data[i]); if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_free(m); } struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_get(&msg->kref); return msg; } EXPORT_SYMBOL(ceph_msg_get); void ceph_msg_put(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_put(&msg->kref, ceph_msg_release); } EXPORT_SYMBOL(ceph_msg_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); print_hex_dump(KERN_DEBUG, " front: ", DUMP_PREFIX_OFFSET, 16, 1, msg->front.iov_base, msg->front.iov_len, true); if (msg->middle) print_hex_dump(KERN_DEBUG, "middle: ", DUMP_PREFIX_OFFSET, 16, 1, msg->middle->vec.iov_base, msg->middle->vec.iov_len, true); print_hex_dump(KERN_DEBUG, "footer: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } EXPORT_SYMBOL(ceph_msg_dump); |
| 10 10 183 431 5289 5287 1155 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef RQ_QOS_H #define RQ_QOS_H #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk_types.h> #include <linux/atomic.h> #include <linux/wait.h> #include <linux/blk-mq.h> #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; extern struct static_key_false block_rq_qos; enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, }; struct rq_wait { wait_queue_head_t wait; atomic_t inflight; }; struct rq_qos { const struct rq_qos_ops *ops; struct gendisk *disk; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; #endif }; struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { unsigned int max_depth; int scale_step; bool scaled_max; unsigned int queue_depth; unsigned int default_depth; }; static inline struct rq_qos *rq_qos_id(struct request_queue *q, enum rq_qos_id id) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->id == id) break; } return rqos; } static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_WBT); } static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); init_waitqueue_head(&rq_wait->wait); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops); void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); bool rq_depth_scale_up(struct rq_depth *rqd); bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); void __rq_qos_done(struct rq_qos *rqos, struct request *rq); void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { if (static_branch_unlikely(&block_rq_qos) && bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) __rq_qos_done_bio(q->rq_qos, bio); } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) { if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } void rq_qos_exit(struct request_queue *); #endif |
| 2773 2773 2785 155 4290 2292 2803 21 2490 24 38 24 20 18 3 753 1049 1051 50 41 742 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. */ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif #define pmd_folio(pmd) page_folio(pmd_page(pmd)) /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef kernel_pte_init static inline void kernel_pte_init(void *addr) { } #define kernel_pte_init kernel_pte_init #endif #ifndef pmd_init static inline void pmd_init(void *addr) { } #define pmd_init pmd_init #endif #ifndef pud_init static inline void pud_init(void *addr) { } #define pud_init pud_init #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit * of the lazy mode. So the implementation must assume preemption may be enabled * and cpu migration is possible; it must take steps to be robust against this. * (In practice, for user PTE updates, the appropriate page table lock(s) are * held, but for kernel PTE updates, no lock is held). Nesting is not permitted * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif #ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. Further, * the dirty state must apply atomically to all the PTEs described by the hint. * * May be overridden by the architecture, else pte_batch_hint is always 1. */ static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) { return 1; } #endif #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * When nr==1, initial state of pte may be present or not present, and new state * may be present or not present. When nr>1, initial state of all ptes must be * not present, and new state must be present. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. */ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef arch_check_zapped_pud static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, pte); return pte; } #endif #ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif #ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = ptep_get_and_clear_full(mm, addr, ptep, full); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef update_mmu_tlb_range static inline void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { } #endif static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { update_mmu_tlb_range(vma, address, ptep, 1); } /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD. */ static inline void clear_not_present_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to write-protect. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_set_wrprotect(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. */ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { } #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { for (int i = 0; i < nr; i++) { arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, pte_advance_pfn(pte, i), pte_advance_pfn(oldpte, i)); } } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { } #else /** * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to modify * * Lookup the cachemode for the pfn range starting at @pfn with the size * @size and store it in @prot, leaving other data in @prot unchanged. * * This allows for a hardware implementation to have fine-grained control of * memory cache behavior at page level granularity. Without a hardware * implementation, this function does nothing. * * Currently there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * This function can fail if the pfn range spans pfns that require differing * cachemodes. If the pfn range was previously verified to have a single * cachemode, it is sufficient to query only a single pfn. The assumption is * that this is the case for drivers using the vmf_insert_pfn*() interface. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_track - track a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to track * * Requested the pfn range to be 'tracked' by a hardware implementation and * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). * * This allows for fine-grained control of memory cache behaviour at page * level granularity. Tracking memory this way is persisted across VMA splits * (VMA merging does not apply for VM_PFNMAP). * * Currently, there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_untrack - untrack a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * * Untrack a pfn range previously tracked through pfnmap_track(). */ void pfnmap_untrack(unsigned long pfn, unsigned long size); #endif /** * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn * @pfn: the pfn * @prot: the pgprot to modify * * Lookup the cachemode for @pfn and store it in @prot, leaving other * data in @prot unchanged. * * See pfnmap_setup_cachemode() for details. */ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; } static inline int pud_devmap(pud_t pud) { return 0; } static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() set of functions an in the generic * vmalloc/ioremap code to track at which page-table levels entries have been * modified. Based on that the code can better decide when vmalloc and ioremap * mapping changes need to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(), * pXd_devmap(), or hugetlb mappings). */ #ifndef pgd_leaf #define pgd_leaf(x) false #endif #ifndef p4d_leaf #define p4d_leaf(x) false #endif #ifndef pud_leaf #define pud_leaf(x) false #endif #ifndef pmd_leaf #define pmd_leaf(x) false #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif #define __pte_leaf_size(x,y) pte_leaf_size(y) #endif /* * We always define pmd_pfn for all archs as it's used in lots of generic * code. Now it happens too for pud_pfn (and can happen for larger * mappings too in the future; we're not there yet). Instead of defining * it for all archs (like pmd_pfn), provide a fallback. * * Note that returning 0 here means any arch that didn't define this can * get severely wrong when it hits a real pud leaf. It's arch's * responsibility to properly define it when a huge pud is possible. */ #ifndef pud_pfn #define pud_pfn(x) 0 #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #ifndef pte_pgprot #define pte_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pmd_pgprot #define pmd_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pud_pgprot #define pud_pgprot(x) ((pgprot_t) {0}) #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(unsigned long vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */ |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | #undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicate packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_enqueue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb), TP_ARGS(qdisc, txq, skb), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; ), TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ? qdisc_dev(q)->name : "(null)" ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q)->name ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 54 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BYTEORDER_GENERIC_H #define _LINUX_BYTEORDER_GENERIC_H /* * linux/byteorder/generic.h * Generic Byte-reordering support * * The "... p" macros, like le64_to_cpup, can be used with pointers * to unaligned data, but there will be a performance penalty on * some architectures. Use get_unaligned for unaligned data. * * Francois-Rene Rideau <fare@tunes.org> 19970707 * gathered all the good ideas from all asm-foo/byteorder.h into one file, * cleaned them up. * I hope it is compliant with non-GCC compilers. * I decided to put __BYTEORDER_HAS_U64__ in byteorder.h, * because I wasn't sure it would be ok to put it in types.h * Upgraded it to 2.1.43 * Francois-Rene Rideau <fare@tunes.org> 19971012 * Upgraded it to 2.1.57 * to please Linus T., replaced huge #ifdef's between little/big endian * by nestedly #include'd files. * Francois-Rene Rideau <fare@tunes.org> 19971205 * Made it to 2.1.71; now a facelift: * Put files under include/linux/byteorder/ * Split swab from generic support. * * TODO: * = Regular kernel maintainers could also replace all these manual * byteswap macros that remain, disseminated among drivers, * after some grep or the sources... * = Linus might want to rename all these macros and files to fit his taste, * to fit his personal naming scheme. * = it seems that a few drivers would also appreciate * nybble swapping support... * = every architecture could add their byteswap macro in asm/byteorder.h * see how some architectures already do (i386, alpha, ppc, etc) * = cpu_to_beXX and beXX_to_cpu might some day need to be well * distinguished throughout the kernel. This is not the case currently, * since little endian, big endian, and pdp endian machines needn't it. * But this might be the case for, say, a port of Linux to 20/21 bit * architectures (and F21 Linux addict around?). */ /* * The following macros are to be defined by <asm/byteorder.h>: * * Conversion of long and short int between network and host format * ntohl(__u32 x) * ntohs(__u16 x) * htonl(__u32 x) * htons(__u16 x) * It seems that some programs (which? where? or perhaps a standard? POSIX?) * might like the above to be functions, not macros (why?). * if that's true, then detect them, and take measures. * Anyway, the measure is: define only ___ntohl as a macro instead, * and in a separate file, have * unsigned long inline ntohl(x){return ___ntohl(x);} * * The same for constant arguments * __constant_ntohl(__u32 x) * __constant_ntohs(__u16 x) * __constant_htonl(__u32 x) * __constant_htons(__u16 x) * * Conversion of XX-bit integers (16- 32- or 64-) * between native CPU format and little/big endian format * 64-bit stuff only defined for proper architectures * cpu_to_[bl]eXX(__uXX x) * [bl]eXX_to_cpu(__uXX x) * * The same, but takes a pointer to the value to convert * cpu_to_[bl]eXXp(__uXX x) * [bl]eXX_to_cpup(__uXX x) * * The same, but change in situ * cpu_to_[bl]eXXs(__uXX x) * [bl]eXX_to_cpus(__uXX x) * * See asm-foo/byteorder.h for examples of how to provide * architecture-optimized versions * */ #define cpu_to_le64 __cpu_to_le64 #define le64_to_cpu __le64_to_cpu #define cpu_to_le32 __cpu_to_le32 #define le32_to_cpu __le32_to_cpu #define cpu_to_le16 __cpu_to_le16 #define le16_to_cpu __le16_to_cpu #define cpu_to_be64 __cpu_to_be64 #define be64_to_cpu __be64_to_cpu #define cpu_to_be32 __cpu_to_be32 #define be32_to_cpu __be32_to_cpu #define cpu_to_be16 __cpu_to_be16 #define be16_to_cpu __be16_to_cpu #define cpu_to_le64p __cpu_to_le64p #define le64_to_cpup __le64_to_cpup #define cpu_to_le32p __cpu_to_le32p #define le32_to_cpup __le32_to_cpup #define cpu_to_le16p __cpu_to_le16p #define le16_to_cpup __le16_to_cpup #define cpu_to_be64p __cpu_to_be64p #define be64_to_cpup __be64_to_cpup #define cpu_to_be32p __cpu_to_be32p #define be32_to_cpup __be32_to_cpup #define cpu_to_be16p __cpu_to_be16p #define be16_to_cpup __be16_to_cpup #define cpu_to_le64s __cpu_to_le64s #define le64_to_cpus __le64_to_cpus #define cpu_to_le32s __cpu_to_le32s #define le32_to_cpus __le32_to_cpus #define cpu_to_le16s __cpu_to_le16s #define le16_to_cpus __le16_to_cpus #define cpu_to_be64s __cpu_to_be64s #define be64_to_cpus __be64_to_cpus #define cpu_to_be32s __cpu_to_be32s #define be32_to_cpus __be32_to_cpus #define cpu_to_be16s __cpu_to_be16s #define be16_to_cpus __be16_to_cpus /* * They have to be macros in order to do the constant folding * correctly - if the argument passed into a inline function * it is no longer constant according to gcc.. */ #undef ntohl #undef ntohs #undef htonl #undef htons #define ___htonl(x) __cpu_to_be32(x) #define ___htons(x) __cpu_to_be16(x) #define ___ntohl(x) __be32_to_cpu(x) #define ___ntohs(x) __be16_to_cpu(x) #define htonl(x) ___htonl(x) #define ntohl(x) ___ntohl(x) #define htons(x) ___htons(x) #define ntohs(x) ___ntohs(x) static inline void le16_add_cpu(__le16 *var, u16 val) { *var = cpu_to_le16(le16_to_cpu(*var) + val); } static inline void le32_add_cpu(__le32 *var, u32 val) { *var = cpu_to_le32(le32_to_cpu(*var) + val); } static inline void le64_add_cpu(__le64 *var, u64 val) { *var = cpu_to_le64(le64_to_cpu(*var) + val); } /* XXX: this stuff can be optimized */ static inline void le32_to_cpu_array(u32 *buf, unsigned int words) { while (words--) { __le32_to_cpus(buf); buf++; } } static inline void cpu_to_le32_array(u32 *buf, unsigned int words) { while (words--) { __cpu_to_le32s(buf); buf++; } } static inline void be16_add_cpu(__be16 *var, u16 val) { *var = cpu_to_be16(be16_to_cpu(*var) + val); } static inline void be32_add_cpu(__be32 *var, u32 val) { *var = cpu_to_be32(be32_to_cpu(*var) + val); } static inline void be64_add_cpu(__be64 *var, u64 val) { *var = cpu_to_be64(be64_to_cpu(*var) + val); } static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) { size_t i; for (i = 0; i < len; i++) dst[i] = cpu_to_be32(src[i]); } static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) { size_t i; for (i = 0; i < len; i++) dst[i] = be32_to_cpu(src[i]); } #endif /* _LINUX_BYTEORDER_GENERIC_H */ |
| 4 4 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2012-2013 Red Hat, Inc. * All rights reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_symlink_remote.h" #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_health.h" /* * Each contiguous block has a header, so it is not just a simple pathlen * to FSB conversion. */ int xfs_symlink_blocks( struct xfs_mount *mp, int pathlen) { int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); return (pathlen + buflen - 1) / buflen; } int xfs_symlink_hdr_set( struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp) { struct xfs_dsymlink_hdr *dsl = bp->b_addr; if (!xfs_has_crc(mp)) return 0; memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr)); dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); dsl->sl_offset = cpu_to_be32(offset); dsl->sl_bytes = cpu_to_be32(size); uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp)); bp->b_ops = &xfs_symlink_buf_ops; return sizeof(struct xfs_dsymlink_hdr); } /* * Checking of the symlink header is split into two parts. the verifier does * CRC, location and bounds checking, the unpacking function checks the path * parameters and owner. */ bool xfs_symlink_hdr_ok( xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp) { struct xfs_dsymlink_hdr *dsl = bp->b_addr; if (offset != be32_to_cpu(dsl->sl_offset)) return false; if (size != be32_to_cpu(dsl->sl_bytes)) return false; if (ino != be64_to_cpu(dsl->sl_owner)) return false; /* ok */ return true; } static xfs_failaddr_t xfs_symlink_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) return NULL; if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno)) return __this_address; if (be32_to_cpu(dsl->sl_offset) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) return __this_address; if (dsl->sl_owner == 0) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn))) return __this_address; return NULL; } static void xfs_symlink_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_symlink_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void xfs_symlink_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; xfs_failaddr_t fa; /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) return; fa = xfs_symlink_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } if (bip) { struct xfs_dsymlink_hdr *dsl = bp->b_addr; dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); } xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); } const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, }; void xfs_symlink_local_to_remote( struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp, void *priv) { struct xfs_mount *mp = ip->i_mount; char *buf; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); if (!xfs_has_crc(mp)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } /* * As this symlink fits in an inode literal area, it must also fit in * the smallest buffer the filesystem supports. */ ASSERT(BBTOB(bp->b_length) >= ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); bp->b_ops = &xfs_symlink_buf_ops; buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); memcpy(buf, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } /* * Verify the in-memory consistency of an inline symlink data fork. This * does not do on-disk format checks. */ xfs_failaddr_t xfs_symlink_shortform_verify( void *sfp, int64_t size) { char *endp = sfp + size; /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. */ if (!size) return __this_address; /* No negative sizes or overly long symlink targets. */ if (size < 0 || size > XFS_SYMLINK_MAXLEN) return __this_address; /* No NULLs in the target either. */ if (memchr(sfp, 0, size - 1)) return __this_address; /* We /did/ null-terminate the buffer, right? */ if (*endp != 0) return __this_address; return NULL; } /* Read a remote symlink target into the buffer. */ int xfs_symlink_remote_read( struct xfs_inode *ip, char *link) { struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; struct xfs_buf *bp; xfs_daddr_t d; char *cur_chunk; int pathlen = ip->i_disk_size; int nmaps = XFS_SYMLINK_MAPS; int byte_cnt; int n; int error = 0; int fsblocks = 0; int offset; xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); fsblocks = xfs_symlink_blocks(mp, pathlen); error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); if (error) goto out; offset = 0; for (n = 0; n < nmaps; n++) { d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, &bp, &xfs_symlink_buf_ops); if (xfs_metadata_is_sick(error)) xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); if (error) return error; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); if (pathlen < byte_cnt) byte_cnt = pathlen; cur_chunk = bp->b_addr; if (xfs_has_crc(mp)) { if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); error = -EFSCORRUPTED; xfs_alert(mp, "symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)", offset, byte_cnt, ip->i_ino); xfs_buf_relse(bp); goto out; } cur_chunk += sizeof(struct xfs_dsymlink_hdr); } memcpy(link + offset, cur_chunk, byte_cnt); pathlen -= byte_cnt; offset += byte_cnt; xfs_buf_relse(bp); } ASSERT(pathlen == 0); link[ip->i_disk_size] = '\0'; error = 0; out: return error; } /* Write the symlink target into the inode. */ int xfs_symlink_write_target( struct xfs_trans *tp, struct xfs_inode *ip, xfs_ino_t owner, const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, uint resblks) { struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; struct xfs_mount *mp = tp->t_mountp; const char *cur_chunk; struct xfs_buf *bp; xfs_daddr_t d; int byte_cnt; int nmaps; int offset = 0; int n; int error; /* * If the symlink will fit into the inode, write it inline. */ if (pathlen <= xfs_inode_data_fork_size(ip)) { xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_disk_size = pathlen; ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); return 0; } nmaps = XFS_SYMLINK_MAPS; error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA, resblks, mval, &nmaps); if (error) return error; ip->i_disk_size = pathlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); cur_chunk = target_path; offset = 0; for (n = 0; n < nmaps; n++) { char *buf; d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, &bp); if (error) return error; bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp); memcpy(buf, cur_chunk, byte_cnt); cur_chunk += byte_cnt; pathlen -= byte_cnt; offset += byte_cnt; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } ASSERT(pathlen == 0); return 0; } /* Remove all the blocks from a symlink and invalidate buffers. */ int xfs_symlink_remote_truncate( struct xfs_trans *tp, struct xfs_inode *ip) { struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp; int nmaps = XFS_SYMLINK_MAPS; int done = 0; int i; int error; /* Read mappings and invalidate buffers. */ error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0); if (error) return error; for (i = 0; i < nmaps; i++) { if (!xfs_bmap_is_real_extent(&mval[i])) break; error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, &bp); if (error) return error; xfs_trans_binval(tp, bp); } /* Unmap the remote blocks. */ error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done); if (error) return error; if (!done) { ASSERT(done); xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); return -EFSCORRUPTED; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; } |
| 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H #include "internal.h" #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> /* Attribute not found */ #define ENOATTR ENODATA #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; extern const struct xattr_handler erofs_xattr_security_handler; static inline const char *erofs_xattr_prefix(unsigned int idx, struct dentry *dentry) { const struct xattr_handler *handler = NULL; static const struct xattr_handler * const xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, #endif }; if (idx && idx < ARRAY_SIZE(xattr_handler_map)) handler = xattr_handler_map[idx]; if (!xattr_handler_can_list(handler, dentry)) return NULL; return xattr_prefix(handler); } extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); void erofs_xattr_prefixes_cleanup(struct super_block *sb); int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t erofs_listxattr(struct dentry *, char *, size_t); #else static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} static inline int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { return -EOPNOTSUPP; } #define erofs_listxattr (NULL) #define erofs_xattr_handlers (NULL) #endif /* !CONFIG_EROFS_FS_XATTR */ #ifdef CONFIG_EROFS_FS_POSIX_ACL struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #else #define erofs_get_acl (NULL) #endif #endif |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_DRIVER_H #define _LINUX_TTY_DRIVER_H #include <linux/export.h> #include <linux/fs.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/cdev.h> #include <linux/uaccess.h> #include <linux/termios.h> #include <linux/seq_file.h> struct tty_struct; struct tty_driver; struct serial_icounter_struct; struct serial_struct; /** * enum tty_driver_flag -- TTY Driver Flags * * These are flags passed to tty_alloc_driver(). * * @TTY_DRIVER_INSTALLED: * Whether this driver was succesfully installed. This is a tty internal * flag. Do not touch. * * @TTY_DRIVER_RESET_TERMIOS: * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * * @TTY_DRIVER_REAL_RAW: * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` * * That is, if there is no reason for the driver to * send notifications of parity and break characters up to the line * driver, it won't do so. This allows the line driver to optimize for * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) * * @TTY_DRIVER_DYNAMIC_DEV: * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will * be show up properly in sysfs. If not set, all &tty_driver.num entries * will be created by the tty core in sysfs when tty_register_driver() is * called. This is to be used by drivers that have tty devices that can * appear and disappear while the main tty driver is registered with the * tty core. * * @TTY_DRIVER_DEVPTS_MEM: * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * * @TTY_DRIVER_HARDWARE_BREAK: * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * * @TTY_DRIVER_DYNAMIC_ALLOC: * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * * @TTY_DRIVER_UNNUMBERED_NODE: * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ enum tty_driver_flag { TTY_DRIVER_INSTALLED = BIT(0), TTY_DRIVER_RESET_TERMIOS = BIT(1), TTY_DRIVER_REAL_RAW = BIT(2), TTY_DRIVER_DYNAMIC_DEV = BIT(3), TTY_DRIVER_DEVPTS_MEM = BIT(4), TTY_DRIVER_HARDWARE_BREAK = BIT(5), TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), TTY_DRIVER_UNNUMBERED_NODE = BIT(7), }; enum tty_driver_type { TTY_DRIVER_TYPE_SYSTEM, TTY_DRIVER_TYPE_CONSOLE, TTY_DRIVER_TYPE_SERIAL, TTY_DRIVER_TYPE_PTY, TTY_DRIVER_TYPE_SCC, TTY_DRIVER_TYPE_SYSCONS, }; enum tty_driver_subtype { SYSTEM_TYPE_TTY = 1, SYSTEM_TYPE_CONSOLE, SYSTEM_TYPE_SYSCONS, SYSTEM_TYPE_SYSPTMX, PTY_TYPE_MASTER = 1, PTY_TYPE_SLAVE, SERIAL_TYPE_NORMAL = 1, }; /** * struct tty_operations -- interface between driver and tty * * @lookup: ``struct tty_struct *()(struct tty_driver *self, struct file *, * int idx)`` * * Return the tty device corresponding to @idx, %NULL if there is not * one currently in use and an %ERR_PTR value on error. Called under * %tty_mutex (for now!) * * Optional method. Default behaviour is to use the @self->ttys array. * * @install: ``int ()(struct tty_driver *self, struct tty_struct *tty)`` * * Install a new @tty into the @self's internal tables. Used in * conjunction with @lookup and @remove methods. * * Optional method. Default behaviour is to use the @self->ttys array. * * @remove: ``void ()(struct tty_driver *self, struct tty_struct *tty)`` * * Remove a closed @tty from the @self's internal tables. Used in * conjunction with @lookup and @remove methods. * * Optional method. Default behaviour is to use the @self->ttys array. * * @open: ``int ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is opened. This * routine is mandatory; if this routine is not filled in, the attempted * open will fail with %ENODEV. * * Required method. Called with tty lock held. May sleep. * * @close: ``void ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is closed. At the * point of return from this call the driver must make no further ldisc * calls of any kind. * * Remark: called even if the corresponding @open() failed. * * Required method. Called with tty lock held. May sleep. * * @shutdown: ``void ()(struct tty_struct *tty)`` * * This routine is called under the tty lock when a particular @tty device * is closed for the last time. It executes before the @tty resources * are freed so may execute while another function holds a @tty kref. * * @cleanup: ``void ()(struct tty_struct *tty)`` * * This routine is called asynchronously when a particular @tty device * is closed for the last time freeing up the resources. This is * actually the second part of shutdown for routines that might sleep. * * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)`` * * This routine is called by the kernel to write a series (@count) of * characters (@buf) to the @tty device. The characters may come from * user space or kernel space. This routine will return the * number of characters actually accepted for writing. * * May occur in parallel in special cases. Because this includes panic * paths drivers generally shouldn't try and do clever locking here. * * Optional: Required for writable devices. May not sleep. * * @put_char: ``int ()(struct tty_struct *tty, u8 ch)`` * * This routine is called by the kernel to write a single character @ch to * the @tty device. If the kernel uses this routine, it must call the * @flush_chars() routine (if defined) when it is done stuffing characters * into the driver. If there is no room in the queue, the character is * ignored. * * Optional: Kernel will use the @write method if not provided. Do not * call this function directly, call tty_put_char(). * * @flush_chars: ``void ()(struct tty_struct *tty)`` * * This routine is called by the kernel after it has written a * series of characters to the tty device using @put_char(). * * Optional. Do not call this function directly, call * tty_driver_flush_chars(). * * @write_room: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the numbers of characters the @tty driver * will accept for queuing to be written. This number is subject * to change as output buffers get emptied, or if the output flow * control is acted. * * The ldisc is responsible for being intelligent about multi-threading of * write_room/write calls * * Required if @write method is provided else not needed. Do not call this * function directly, call tty_write_room() * * @chars_in_buffer: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the number of characters in the device private * output queue. Used in tty_wait_until_sent() and for poll() * implementation. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call tty_chars_in_buffer(). * * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * This routine allows the @tty driver to implement device-specific * ioctls. If the ioctl number passed in @cmd is not recognized by the * driver, it should return %ENOIOCTLCMD. * * Optional. * * @compat_ioctl: ``long ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * Implement ioctl processing for 32 bit process on 64 bit system. * * Optional. * * @set_termios: ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's * termios settings have changed. New settings are in @tty->termios. * Previous settings are passed in the @old argument. * * The API is defined such that the driver should return the actual modes * selected. This means that the driver is responsible for modifying any * bits in @tty->termios it cannot fulfill to indicate the actual modes * being used. * * Optional. Called under the @tty->termios_rwsem. May sleep. * * @ldisc_ok: ``int ()(struct tty_struct *tty, int ldisc)`` * * This routine allows the @tty driver to decide if it can deal * with a particular @ldisc. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @set_ldisc: ``void ()(struct tty_struct *tty)`` * * This routine allows the @tty driver to be notified when the device's * line discipline is being changed. At the point this is done the * discipline is not yet usable. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @throttle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that input buffers for the line * discipline are close to full, and it should somehow signal that no more * characters should be sent to the @tty. * * Serialization including with @unthrottle() is the job of the ldisc * layer. * * Optional: Always invoke via tty_throttle_safe(). Called under the * @tty->termios_rwsem. * * @unthrottle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should signal that * characters can now be sent to the @tty without fear of overrunning the * input buffers of the line disciplines. * * Optional. Always invoke via tty_unthrottle(). Called under the * @tty->termios_rwsem. * * @stop: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should stop outputting * characters to the tty device. * * Called with @tty->flow.lock held. Serialized with @start() method. * * Optional. Always invoke via stop_tty(). * * @start: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it resumed sending * characters to the @tty device. * * Called with @tty->flow.lock held. Serialized with stop() method. * * Optional. Always invoke via start_tty(). * * @hangup: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should hang up the @tty * device. * * Optional. Called with tty lock held. * * @break_ctl: ``int ()(struct tty_struct *tty, int state)`` * * This optional routine requests the @tty driver to turn on or off BREAK * status on the RS-232 port. If @state is -1, then the BREAK status * should be turned on; if @state is 0, then BREAK should be turned off. * * If this routine is implemented, the high-level tty driver will handle * the following ioctls: %TCSBRK, %TCSBRKP, %TIOCSBRK, %TIOCCBRK. * * If the driver sets %TTY_DRIVER_HARDWARE_BREAK in tty_alloc_driver(), * then the interface will also be called with actual times and the * hardware is expected to do the delay work itself. 0 and -1 are still * used for on/off. * * Optional: Required for %TCSBRK/%BRKP/etc. handling. May sleep. * * @flush_buffer: ``void ()(struct tty_struct *tty)`` * * This routine discards device private output buffer. Invoked on close, * hangup, to implement %TCOFLUSH ioctl and similar. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call * tty_driver_flush_buffer(). * * @wait_until_sent: ``void ()(struct tty_struct *tty, int timeout)`` * * This routine waits until the device has written out all of the * characters in its transmitter FIFO. Or until @timeout (in jiffies) is * reached. * * Optional: If not provided, the device is assumed to have no FIFO. * Usually correct to invoke via tty_wait_until_sent(). May sleep. * * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)`` * * This routine is used to send a high-priority XON/XOFF character (@ch) * to the @tty device. * * Optional: If not provided, then the @write method is called under * the @tty->atomic_write_lock to keep it serialized with the ldisc. * * @tiocmget: ``int ()(struct tty_struct *tty)`` * * This routine is used to obtain the modem status bits from the @tty * driver. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMGET * ioctl. Do not call this function directly, call tty_tiocmget(). * * @tiocmset: ``int ()(struct tty_struct *tty, * unsigned int set, unsigned int clear)`` * * This routine is used to set the modem status bits to the @tty driver. * First, @clear bits should be cleared, then @set bits set. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMSET * ioctl. Do not call this function directly, call tty_tiocmset(). * * @resize: ``int ()(struct tty_struct *tty, struct winsize *ws)`` * * Called when a termios request is issued which changes the requested * terminal geometry to @ws. * * Optional: the default action is to update the termios structure * without error. This is usually the correct behaviour. Drivers should * not force errors here if they are not resizable objects (e.g. a serial * line). See tty_do_resize() if you need to wrap the standard method * in your own logic -- the usual case. * * @get_icount: ``int ()(struct tty_struct *tty, * struct serial_icounter *icount)`` * * Called when the @tty device receives a %TIOCGICOUNT ioctl. Passed a * kernel structure @icount to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * * @get_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCGSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocgserial(). * * @set_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCSSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to set the values from. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocsserial(). * * @show_fdinfo: ``void ()(struct tty_struct *tty, struct seq_file *m)`` * * Called when the @tty device file descriptor receives a fdinfo request * from VFS (to show in /proc/<pid>/fdinfo/). @m should be filled with * information. * * Optional: called only if provided, otherwise nothing is written to @m. * Do not call this function directly, call tty_show_fdinfo(). * * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` * * kgdboc support (Documentation/process/debugging/kgdb.rst). This routine is * called to initialize the HW for later use by calling @poll_get_char or * @poll_put_char. * * Optional: called only if provided, otherwise skipped as a non-polling * driver. * * @poll_get_char: ``int ()(struct tty_driver *driver, int line)`` * * kgdboc support (see @poll_init). @driver should read a character from a * tty identified by @line and return it. * * Optional: called only if @poll_init provided. * * @poll_put_char: ``void ()(struct tty_driver *driver, int line, char ch)`` * * kgdboc support (see @poll_init). @driver should write character @ch to * a tty identified by @line. * * Optional: called only if @poll_init provided. * * @proc_show: ``int ()(struct seq_file *m, void *driver)`` * * Driver @driver (cast to &struct tty_driver) can show additional info in * /proc/tty/driver/<driver_name>. It is enough to fill in the information * into @m. * * Optional: called only if provided, otherwise no /proc entry created. * * This structure defines the interface between the low-level tty driver and * the tty routines. These routines can be defined. Unless noted otherwise, * they are optional, and can be filled in with a %NULL pointer. */ struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, struct file *filp, int idx); int (*install)(struct tty_driver *driver, struct tty_struct *tty); void (*remove)(struct tty_driver *driver, struct tty_struct *tty); int (*open)(struct tty_struct * tty, struct file * filp); void (*close)(struct tty_struct * tty, struct file * filp); void (*shutdown)(struct tty_struct *tty); void (*cleanup)(struct tty_struct *tty); ssize_t (*write)(struct tty_struct *tty, const u8 *buf, size_t count); int (*put_char)(struct tty_struct *tty, u8 ch); void (*flush_chars)(struct tty_struct *tty); unsigned int (*write_room)(struct tty_struct *tty); unsigned int (*chars_in_buffer)(struct tty_struct *tty); int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); void (*throttle)(struct tty_struct * tty); void (*unthrottle)(struct tty_struct * tty); void (*stop)(struct tty_struct *tty); void (*start)(struct tty_struct *tty); void (*hangup)(struct tty_struct *tty); int (*break_ctl)(struct tty_struct *tty, int state); void (*flush_buffer)(struct tty_struct *tty); int (*ldisc_ok)(struct tty_struct *tty, int ldisc); void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, u8 ch); int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); int (*resize)(struct tty_struct *tty, struct winsize *ws); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); int (*get_serial)(struct tty_struct *tty, struct serial_struct *p); int (*set_serial)(struct tty_struct *tty, struct serial_struct *p); void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct tty_driver *driver, int line, char *options); int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif int (*proc_show)(struct seq_file *m, void *driver); } __randomize_layout; /** * struct tty_driver -- driver for TTY devices * * @kref: reference counting. Reaching zero frees all the internals and the * driver. * @cdevs: allocated/registered character /dev devices * @owner: modules owning this driver. Used drivers cannot be rmmod'ed. * Automatically set by tty_alloc_driver(). * @driver_name: name of the driver used in /proc/tty * @name: used for constructing /dev node name * @name_base: used as a number base for constructing /dev node name * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated * @type: type of tty driver (enum tty_driver_type) * @subtype: subtype of tty driver (enum tty_driver_subtype) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar * @termios: storage for termios at each TTY close for the next open * @driver_state: pointer to driver's arbitrary data * @ops: driver hooks for TTYs. Set them using tty_set_operations(). Use &struct * tty_port helpers in them as much as possible. * @tty_drivers: used internally to link tty_drivers together * * The usual handling of &struct tty_driver is to allocate it by * tty_alloc_driver(), set up all the necessary members, and register it by * tty_register_driver(). At last, the driver is torn down by calling * tty_unregister_driver() followed by tty_driver_kref_put(). * * The fields required to be set before calling tty_register_driver() include * @driver_name, @name, @type, @subtype, @init_termios, and @ops. */ struct tty_driver { struct kref kref; struct cdev **cdevs; struct module *owner; const char *driver_name; const char *name; int name_base; int major; int minor_start; unsigned int num; enum tty_driver_type type; enum tty_driver_subtype subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; /* * Pointer to the tty data structures */ struct tty_struct **ttys; struct tty_port **ports; struct ktermios **termios; void *driver_state; /* * Driver methods */ const struct tty_operations *ops; struct list_head tty_drivers; } __randomize_layout; extern struct list_head tty_drivers; struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags); struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); /** * tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @flags: some of enum tty_driver_flag, will be set in driver->flags * * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) { kref_get(&d->kref); return d; } static inline void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op) { driver->ops = op; } int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); void tty_unregister_device(struct tty_driver *driver, unsigned index); #ifdef CONFIG_PROC_FS void proc_tty_register_driver(struct tty_driver *); void proc_tty_unregister_driver(struct tty_driver *); #else static inline void proc_tty_register_driver(struct tty_driver *d) {} static inline void proc_tty_unregister_driver(struct tty_driver *d) {} #endif #endif /* #ifdef _LINUX_TTY_DRIVER_H */ |
| 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/nfs_fs.h * * Copyright (C) 1992 Rick Sladkey * * OS-specific nfs filesystem definitions and declarations */ #ifndef _LINUX_NFS_FS_H #define _LINUX_NFS_FS_H #include <uapi/linux/nfs_fs.h> /* * Enable dprintk() debugging support for nfs client. */ #ifdef CONFIG_NFS_DEBUG # define NFS_DEBUG #endif #include <linux/in.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/wait.h> #include <linux/sunrpc/debug.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/clnt.h> #ifdef CONFIG_NFS_FSCACHE #include <linux/netfs.h> #endif #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs3.h> #include <linux/nfs4.h> #include <linux/nfs_xdr.h> #include <linux/nfs_fs_sb.h> #include <linux/mempool.h> /* * These are the default for number of transports to different server IPs */ #define NFS_MAX_TRANSPORTS 16 /* * Size of the NFS directory verifier */ #define NFS_DIR_VERIFIER_SIZE 2 /* * NFSv3/v4 Access mode cache entry */ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; kuid_t fsuid; kgid_t fsgid; struct group_info *group_info; u64 timestamp; __u32 mask; struct rcu_head rcu_head; }; struct nfs_lock_context { refcount_t count; struct list_head list; struct nfs_open_context *open_context; fl_owner_t lockowner; atomic_t io_count; struct rcu_head rcu_head; }; struct nfs_file_localio { struct nfsd_file __rcu *ro_file; struct nfsd_file __rcu *rw_file; struct list_head list; void __rcu *nfs_uuid; /* opaque pointer to 'nfs_uuid_t' */ }; static inline void nfs_localio_file_init(struct nfs_file_localio *nfl) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) nfl->ro_file = NULL; nfl->rw_file = NULL; INIT_LIST_HEAD(&nfl->list); nfl->nfs_uuid = NULL; #endif } struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; fl_owner_t flock_owner; struct dentry *dentry; const struct cred *cred; struct rpc_cred __rcu *ll_cred; /* low-level cred - use to check for expiry */ struct nfs4_state *state; fmode_t mode; int error; unsigned long flags; #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) struct nfs4_threshold *mdsthreshold; struct list_head list; struct rcu_head rcu_head; struct nfs_file_localio nfl; }; struct nfs_open_dir_context { struct list_head list; atomic_t cache_hits; atomic_t cache_misses; unsigned long attr_gencount; __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 last_cookie; pgoff_t page_index; unsigned int dtsize; bool force_clear; bool eof; struct rcu_head rcu_head; }; /* * NFSv4 delegation */ struct nfs_delegation; struct posix_acl; struct nfs4_xattr_cache; /* * nfs fs inode data in memory */ struct nfs_inode { /* * The 64bit 'inode number' */ __u64 fileid; /* * NFS file handle */ struct nfs_fh fh; /* * Various flags */ unsigned long flags; /* atomic bit ops */ unsigned long cache_validity; /* bit mask */ /* * read_cache_jiffies is when we started read-caching this inode. * attrtimeo is for how long the cached information is assumed * to be valid. A successful attribute revalidation doubles * attrtimeo (up to acregmax/acdirmax), a failure resets it to * acregmin/acdirmin. * * We need to revalidate the cached attrs for this inode if * * jiffies - read_cache_jiffies >= attrtimeo * * Please note the comparison is greater than or equal * so that zero timeout values can be specified. */ unsigned long read_cache_jiffies; unsigned long attrtimeo; unsigned long attrtimeo_timestamp; unsigned long attr_gencount; struct rb_root access_cache; struct list_head access_cache_entry_lru; struct list_head access_cache_inode_lru; union { /* Directory */ struct { /* "Generation counter" for the attribute cache. * This is bumped whenever we update the metadata * on the server. */ unsigned long cache_change_attribute; /* * This is the cookie verifier used for NFSv3 readdir * operations */ __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; /* Readers: in-flight sillydelete RPC calls */ /* Writers: rmdir */ struct rw_semaphore rmdir_sem; }; /* Regular file */ struct { atomic_long_t nrequests; atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; }; /* Open contexts for shared mmap writes */ struct list_head open_files; /* Keep track of out-of-order replies. * The ooo array contains start/end pairs of * numbers from the changeid sequence when * the inode's iversion has been updated. * It also contains end/start pair (i.e. reverse order) * of sections of the changeid sequence that have * been seen in replies from the server. * Normally these should match and when both * A:B and B:A are found in ooo, they are both removed. * And if a reply with A:B causes an iversion update * of A:B, then neither are added. * When a reply has pre_change that doesn't match * iversion, then the changeid pair and any consequent * change in iversion ARE added. Later replies * might fill in the gaps, or possibly a gap is caused * by a change from another client. * When a file or directory is opened, if the ooo table * is not empty, then we assume the gaps were due to * another client and we invalidate the cached data. * * We can only track a limited number of concurrent gaps. * Currently that limit is 16. * We allocate the table on demand. If there is insufficient * memory, then we probably cannot cache the file anyway * so there is no loss. */ struct { int cnt; struct { u64 start, end; } gap[16]; } *ooo; #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; struct nfs_delegation __rcu *delegation; struct rw_semaphore rwsem; /* pNFS layout information */ struct pnfs_layout_hdr *layout; #endif /* CONFIG_NFS_V4*/ /* how many bytes have been written/read and how many bytes queued up */ __u64 write_io; __u64 read_io; #ifdef CONFIG_NFS_V4_2 struct nfs4_xattr_cache *xattr_cache; #endif union { struct inode vfs_inode; #ifdef CONFIG_NFS_FSCACHE struct netfs_inode netfs; /* netfs context and VFS inode */ #endif }; }; struct nfs4_copy_state { struct list_head copies; struct list_head src_copies; nfs4_stateid stateid; struct completion completion; uint64_t count; struct nfs_writeverf verf; int error; int flags; struct nfs4_state *parent_src_state; struct nfs4_state *parent_dst_state; }; /* * Access bit flags */ #define NFS_ACCESS_READ 0x0001 #define NFS_ACCESS_LOOKUP 0x0002 #define NFS_ACCESS_MODIFY 0x0004 #define NFS_ACCESS_EXTEND 0x0008 #define NFS_ACCESS_DELETE 0x0010 #define NFS_ACCESS_EXECUTE 0x0020 #define NFS_ACCESS_XAREAD 0x0040 #define NFS_ACCESS_XAWRITE 0x0080 #define NFS_ACCESS_XALIST 0x0100 /* * Cache validity bit flags */ #define NFS_INO_INVALID_DATA BIT(1) /* cached data is invalid */ #define NFS_INO_INVALID_ATIME BIT(2) /* cached atime is invalid */ #define NFS_INO_INVALID_ACCESS BIT(3) /* cached access cred invalid */ #define NFS_INO_INVALID_ACL BIT(4) /* cached acls are invalid */ #define NFS_INO_REVAL_FORCED BIT(6) /* force revalidation ignoring a delegation */ #define NFS_INO_INVALID_LABEL BIT(7) /* cached label is invalid */ #define NFS_INO_INVALID_CHANGE BIT(8) /* cached change is invalid */ #define NFS_INO_INVALID_CTIME BIT(9) /* cached ctime is invalid */ #define NFS_INO_INVALID_MTIME BIT(10) /* cached mtime is invalid */ #define NFS_INO_INVALID_SIZE BIT(11) /* cached size is invalid */ #define NFS_INO_INVALID_OTHER BIT(12) /* other attrs are invalid */ #define NFS_INO_DATA_INVAL_DEFER \ BIT(13) /* Deferred cache invalidation */ #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ #define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ #define NFS_INO_INVALID_MODE BIT(17) /* cached mode is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_SIZE \ | NFS_INO_INVALID_NLINK \ | NFS_INO_INVALID_MODE \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* * Bit offsets in flags field */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ #define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { return container_of(inode, struct nfs_inode, vfs_inode); } static inline struct nfs_server *NFS_SB(const struct super_block *s) { return (struct nfs_server *)(s->s_fs_info); } static inline struct nfs_fh *NFS_FH(const struct inode *inode) { return &NFS_I(inode)->fh; } static inline struct nfs_server *NFS_SERVER(const struct inode *inode) { return NFS_SB(inode->i_sb); } static inline struct rpc_clnt *NFS_CLIENT(const struct inode *inode) { return NFS_SERVER(inode)->client; } static inline const struct nfs_rpc_ops *NFS_PROTO(const struct inode *inode) { return NFS_SERVER(inode)->nfs_client->rpc_ops; } static inline unsigned NFS_MINATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); return S_ISDIR(inode->i_mode) ? nfss->acdirmin : nfss->acregmin; } static inline unsigned NFS_MAXATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); return S_ISDIR(inode->i_mode) ? nfss->acdirmax : nfss->acregmax; } static inline int NFS_STALE(const struct inode *inode) { return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } static inline __u64 NFS_FILEID(const struct inode *inode) { return NFS_I(inode)->fileid; } static inline void set_nfs_fileid(struct inode *inode, __u64 fileid) { NFS_I(inode)->fileid = fileid; } static inline void nfs_mark_for_revalidate(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_SIZE; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); } static inline int nfs_server_capable(const struct inode *inode, int cap) { return NFS_SERVER(inode)->caps & cap; } /** * nfs_save_change_attribute - Returns the inode attribute change cookie * @dir - pointer to parent directory inode * The "cache change attribute" is updated when we need to revalidate * our dentry cache after a directory was seen to change on the server. */ static inline unsigned long nfs_save_change_attribute(struct inode *dir) { return NFS_I(dir)->cache_change_attribute; } /* * linux/fs/nfs/inode.c */ extern int nfs_sync_mapping(struct address_space *mapping); extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping); extern void nfs_zap_caches(struct inode *); extern void nfs_set_inode_stale(struct inode *inode); extern void nfs_invalidate_atime(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, struct nfs_fattr *); struct inode *nfs_ilookup(struct super_block *sb, struct nfs_fattr *, struct nfs_fh *); extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct mnt_idmap *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); extern int nfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode); extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode, struct file *filp); extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx); extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern void nfs_file_clear_open_context(struct file *flip); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); extern struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server); static inline void nfs4_label_free(struct nfs4_label *label) { #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (label) { kfree(label->label); kfree(label); } #endif } static inline void nfs_free_fattr(const struct nfs_fattr *fattr) { if (fattr) nfs4_label_free(fattr->label); kfree(fattr); } extern struct nfs_fh *nfs_alloc_fhandle(void); static inline void nfs_free_fhandle(const struct nfs_fh *fh) { kfree(fh); } #ifdef NFS_DEBUG extern u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh); static inline u32 nfs_display_fhandle_hash(const struct nfs_fh *fh) { return _nfs_display_fhandle_hash(fh); } extern void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption); #define nfs_display_fhandle(fh, caption) \ do { \ if (unlikely(nfs_debug & NFSDBG_FACILITY)) \ _nfs_display_fhandle(fh, caption); \ } while (0) #else static inline u32 nfs_display_fhandle_hash(const struct nfs_fh *fh) { return 0; } static inline void nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) { } #endif /* * linux/fs/nfs/nfsroot.c */ extern int nfs_root_data(char **root_device, char **root_data); /*__init*/ /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ /* * linux/fs/nfs/file.c */ extern const struct file_operations nfs_file_operations; #if IS_ENABLED(CONFIG_NFS_V4) extern const struct file_operations nfs4_file_operations; #endif /* CONFIG_NFS_V4 */ extern const struct address_space_operations nfs_file_aops; extern const struct address_space_operations nfs_dir_aops; static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) { return filp->private_data; } static inline const struct cred *nfs_file_cred(struct file *file) { if (file != NULL) { struct nfs_open_context *ctx = nfs_file_open_context(file); if (ctx) return ctx->cred; } return NULL; } /* * linux/fs/nfs/direct.c */ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, bool swap); ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c */ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); extern void nfs_set_verifier(struct dentry * dentry, unsigned long verf); #if IS_ENABLED(CONFIG_NFS_V4) extern void nfs_clear_verifier_delegated(struct inode *inode); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block); extern int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode); /* * linux/fs/nfs/symlink.c */ extern const struct inode_operations nfs_symlink_inode_operations; /* * linux/fs/nfs/sysctl.c */ #ifdef CONFIG_SYSCTL extern int nfs_register_sysctl(void); extern void nfs_unregister_sysctl(void); #else #define nfs_register_sysctl() 0 #define nfs_unregister_sysctl() do { } while(0) #endif /* * linux/fs/nfs/namespace.c */ extern const struct inode_operations nfs_mountpoint_inode_operations; extern const struct inode_operations nfs_referral_inode_operations; extern int nfs_mountpoint_expiry_timeout; extern void nfs_release_automount_timer(void); /* * linux/fs/nfs/unlink.c */ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); /* * linux/fs/nfs/write.c */ extern int nfs_congestion_kb; extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct folio *folio); extern int nfs_update_folio(struct file *file, struct folio *folio, unsigned int offset, unsigned int count); /* * Try to write back everything synchronously (but check the * return value!) */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_folio(struct inode *inode, struct folio *folio); int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); void nfs_commit_begin(struct nfs_mds_commit_info *cinfo); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); static inline bool nfs_have_writebacks(const struct inode *inode) { if (S_ISREG(inode->i_mode)) return atomic_long_read(&NFS_I(inode)->nrequests) != 0; return false; } /* * linux/fs/nfs/read.c */ int nfs_read_folio(struct file *, struct folio *); void nfs_readahead(struct readahead_control *); /* * inline functions */ static inline loff_t nfs_size_to_loff_t(__u64 size) { return min_t(u64, size, OFFSET_MAX); } static inline ino_t nfs_fileid_to_ino_t(u64 fileid) { ino_t ino = (ino_t) fileid; if (sizeof(ino_t) < sizeof(u64)) ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; return ino; } static inline void nfs_ooo_clear(struct nfs_inode *nfsi) { nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; kfree(nfsi->ooo); nfsi->ooo = NULL; } static inline bool nfs_ooo_test(struct nfs_inode *nfsi) { return (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) || (nfsi->ooo && nfsi->ooo->cnt > 0); } #define NFS_JUKEBOX_RETRY_TIME (5 * HZ) /* We need to block new opens while a file is being unlinked. * If it is opened *before* we decide to unlink, we will silly-rename * instead. If it is opened *after*, then we need to create or will fail. * If we allow the two to race, we could end up with a file that is open * but deleted on the server resulting in ESTALE. * So use ->d_fsdata to record when the unlink is happening * and block dentry revalidation while it is set. */ #define NFS_FSDATA_BLOCKED ((void*)1) # undef ifdebug # ifdef NFS_DEBUG # define ifdebug(fac) if (unlikely(nfs_debug & NFSDBG_##fac)) # define NFS_IFDEBUG(x) x # else # define ifdebug(fac) if (0) # define NFS_IFDEBUG(x) # endif #endif |
| 3861 3858 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/bits.h> #include <linux/bug.h> #include <linux/cleanup.h> #include <linux/compiler_types.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/overflow.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include "access.h" #include "audit.h" #include "domain.h" #include "limits.h" #include "object.h" #include "ruleset.h" static struct landlock_ruleset *create_ruleset(const u32 num_layers) { struct landlock_ruleset *new_ruleset; new_ruleset = kzalloc(struct_size(new_ruleset, access_masks, num_layers), GFP_KERNEL_ACCOUNT); if (!new_ruleset) return ERR_PTR(-ENOMEM); refcount_set(&new_ruleset->usage, 1); mutex_init(&new_ruleset->lock); new_ruleset->root_inode = RB_ROOT; #if IS_ENABLED(CONFIG_INET) new_ruleset->root_net_port = RB_ROOT; #endif /* IS_ENABLED(CONFIG_INET) */ new_ruleset->num_layers = num_layers; /* * hierarchy = NULL * num_rules = 0 * access_masks[] = 0 */ return new_ruleset; } struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask, const access_mask_t net_access_mask, const access_mask_t scope_mask) { struct landlock_ruleset *new_ruleset; /* Informs about useless ruleset. */ if (!fs_access_mask && !net_access_mask && !scope_mask) return ERR_PTR(-ENOMSG); new_ruleset = create_ruleset(1); if (IS_ERR(new_ruleset)) return new_ruleset; if (fs_access_mask) landlock_add_fs_access_mask(new_ruleset, fs_access_mask, 0); if (net_access_mask) landlock_add_net_access_mask(new_ruleset, net_access_mask, 0); if (scope_mask) landlock_add_scope_mask(new_ruleset, scope_mask, 0); return new_ruleset; } static void build_check_rule(void) { const struct landlock_rule rule = { .num_layers = ~0, }; BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS); } static bool is_object_pointer(const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return true; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return false; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return false; } } static struct landlock_rule * create_rule(const struct landlock_id id, const struct landlock_layer (*const layers)[], const u32 num_layers, const struct landlock_layer *const new_layer) { struct landlock_rule *new_rule; u32 new_num_layers; build_check_rule(); if (new_layer) { /* Should already be checked by landlock_merge_ruleset(). */ if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS)) return ERR_PTR(-E2BIG); new_num_layers = num_layers + 1; } else { new_num_layers = num_layers; } new_rule = kzalloc(struct_size(new_rule, layers, new_num_layers), GFP_KERNEL_ACCOUNT); if (!new_rule) return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { /* This should have been caught by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } new_rule->key = id.key; new_rule->num_layers = new_num_layers; /* Copies the original layer stack. */ memcpy(new_rule->layers, layers, flex_array_size(new_rule, layers, num_layers)); if (new_layer) /* Adds a copy of @new_layer on the layer stack. */ new_rule->layers[new_rule->num_layers - 1] = *new_layer; return new_rule; } static struct rb_root *get_root(struct landlock_ruleset *const ruleset, const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return &ruleset->root_inode; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return &ruleset->root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } static void free_rule(struct landlock_rule *const rule, const enum landlock_key_type key_type) { might_sleep(); if (!rule) return; if (is_object_pointer(key_type)) landlock_put_object(rule->key.object); kfree(rule); } static void build_check_ruleset(void) { const struct landlock_ruleset ruleset = { .num_rules = ~0, .num_layers = ~0, }; BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES); BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS); } /** * insert_rule - Create and insert a rule in a ruleset * * @ruleset: The ruleset to be updated. * @id: The ID to build the new rule with. The underlying kernel object, if * any, must be held by the caller. * @layers: One or multiple layers to be copied into the new rule. * @num_layers: The number of @layers entries. * * When user space requests to add a new rule to a ruleset, @layers only * contains one entry and this entry is not assigned to any level. In this * case, the new rule will extend @ruleset, similarly to a boolean OR between * access rights. * * When merging a ruleset in a domain, or copying a domain, @layers will be * added to @ruleset as new constraints, similarly to a boolean AND between * access rights. */ static int insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const struct landlock_layer (*const layers)[], const size_t num_layers) { struct rb_node **walker_node; struct rb_node *parent_node = NULL; struct landlock_rule *new_rule; struct rb_root *root; might_sleep(); lockdep_assert_held(&ruleset->lock); if (WARN_ON_ONCE(!layers)) return -ENOENT; if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object)) return -ENOENT; root = get_root(ruleset, id.type); if (IS_ERR(root)) return PTR_ERR(root); walker_node = &root->rb_node; while (*walker_node) { struct landlock_rule *const this = rb_entry(*walker_node, struct landlock_rule, node); if (this->key.data != id.key.data) { parent_node = *walker_node; if (this->key.data < id.key.data) walker_node = &((*walker_node)->rb_right); else walker_node = &((*walker_node)->rb_left); continue; } /* Only a single-level layer should match an existing rule. */ if (WARN_ON_ONCE(num_layers != 1)) return -EINVAL; /* If there is a matching rule, updates it. */ if ((*layers)[0].level == 0) { /* * Extends access rights when the request comes from * landlock_add_rule(2), i.e. @ruleset is not a domain. */ if (WARN_ON_ONCE(this->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(this->layers[0].level != 0)) return -EINVAL; this->layers[0].access |= (*layers)[0].access; return 0; } if (WARN_ON_ONCE(this->layers[0].level == 0)) return -EINVAL; /* * Intersects access rights when it is a merge between a * ruleset and a domain. */ new_rule = create_rule(id, &this->layers, this->num_layers, &(*layers)[0]); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_replace_node(&this->node, &new_rule->node, root); free_rule(this, id.type); return 0; } /* There is no match for @id. */ build_check_ruleset(); if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES) return -E2BIG; new_rule = create_rule(id, layers, num_layers, NULL); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_link_node(&new_rule->node, parent_node, walker_node); rb_insert_color(&new_rule->node, root); ruleset->num_rules++; return 0; } static void build_check_layer(void) { const struct landlock_layer layer = { .level = ~0, .access = ~0, }; BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS); } /* @ruleset must be locked by the caller. */ int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access) { struct landlock_layer layers[] = { { .access = access, /* When @level is zero, insert_rule() extends @ruleset. */ .level = 0, } }; build_check_layer(); return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers)); } static int merge_tree(struct landlock_ruleset *const dst, struct landlock_ruleset *const src, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *src_root; int err = 0; might_sleep(); lockdep_assert_held(&dst->lock); lockdep_assert_held(&src->lock); src_root = get_root(src, key_type); if (IS_ERR(src_root)) return PTR_ERR(src_root); /* Merges the @src tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root, node) { struct landlock_layer layers[] = { { .level = dst->num_layers, } }; const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; if (WARN_ON_ONCE(walker_rule->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) return -EINVAL; layers[0].access = walker_rule->layers[0].access; err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers)); if (err) return err; } return err; } static int merge_ruleset(struct landlock_ruleset *const dst, struct landlock_ruleset *const src) { int err = 0; might_sleep(); /* Should already be checked by landlock_merge_ruleset() */ if (WARN_ON_ONCE(!src)) return 0; /* Only merge into a domain. */ if (WARN_ON_ONCE(!dst || !dst->hierarchy)) return -EINVAL; /* Locks @dst first because we are its only owner. */ mutex_lock(&dst->lock); mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING); /* Stacks the new layer. */ if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) { err = -EINVAL; goto out_unlock; } dst->access_masks[dst->num_layers - 1] = landlock_upgrade_handled_access_masks(src->access_masks[0]); /* Merges the @src inode tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Merges the @src network port tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ out_unlock: mutex_unlock(&src->lock); mutex_unlock(&dst->lock); return err; } static int inherit_tree(struct landlock_ruleset *const parent, struct landlock_ruleset *const child, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *parent_root; int err = 0; might_sleep(); lockdep_assert_held(&parent->lock); lockdep_assert_held(&child->lock); parent_root = get_root(parent, key_type); if (IS_ERR(parent_root)) return PTR_ERR(parent_root); /* Copies the @parent inode or network tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, parent_root, node) { const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; err = insert_rule(child, id, &walker_rule->layers, walker_rule->num_layers); if (err) return err; } return err; } static int inherit_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const child) { int err = 0; might_sleep(); if (!parent) return 0; /* Locks @child first because we are its only owner. */ mutex_lock(&child->lock); mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); /* Copies the @parent inode tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Copies the @parent network port tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) { err = -EINVAL; goto out_unlock; } /* Copies the parent layer stack and leaves a space for the new layer. */ memcpy(child->access_masks, parent->access_masks, flex_array_size(parent, access_masks, parent->num_layers)); if (WARN_ON_ONCE(!parent->hierarchy)) { err = -EINVAL; goto out_unlock; } landlock_get_hierarchy(parent->hierarchy); child->hierarchy->parent = parent->hierarchy; out_unlock: mutex_unlock(&parent->lock); mutex_unlock(&child->lock); return err; } static void free_ruleset(struct landlock_ruleset *const ruleset) { struct landlock_rule *freeme, *next; might_sleep(); rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode, node) free_rule(freeme, LANDLOCK_KEY_INODE); #if IS_ENABLED(CONFIG_INET) rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_net_port, node) free_rule(freeme, LANDLOCK_KEY_NET_PORT); #endif /* IS_ENABLED(CONFIG_INET) */ landlock_put_hierarchy(ruleset->hierarchy); kfree(ruleset); } void landlock_put_ruleset(struct landlock_ruleset *const ruleset) { might_sleep(); if (ruleset && refcount_dec_and_test(&ruleset->usage)) free_ruleset(ruleset); } static void free_ruleset_work(struct work_struct *const work) { struct landlock_ruleset *ruleset; ruleset = container_of(work, struct landlock_ruleset, work_free); free_ruleset(ruleset); } /* Only called by hook_cred_free(). */ void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset) { if (ruleset && refcount_dec_and_test(&ruleset->usage)) { INIT_WORK(&ruleset->work_free, free_ruleset_work); schedule_work(&ruleset->work_free); } } /** * landlock_merge_ruleset - Merge a ruleset with a domain * * @parent: Parent domain. * @ruleset: New ruleset to be merged. * * The current task is requesting to be restricted. The subjective credentials * must not be in an overridden state. cf. landlock_init_hierarchy_log(). * * Returns the intersection of @parent and @ruleset, or returns @parent if * @ruleset is empty, or returns a duplicate of @ruleset if @parent is empty. */ struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset) { struct landlock_ruleset *new_dom __free(landlock_put_ruleset) = NULL; u32 num_layers; int err; might_sleep(); if (WARN_ON_ONCE(!ruleset || parent == ruleset)) return ERR_PTR(-EINVAL); if (parent) { if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS) return ERR_PTR(-E2BIG); num_layers = parent->num_layers + 1; } else { num_layers = 1; } /* Creates a new domain... */ new_dom = create_ruleset(num_layers); if (IS_ERR(new_dom)) return new_dom; new_dom->hierarchy = kzalloc(sizeof(*new_dom->hierarchy), GFP_KERNEL_ACCOUNT); if (!new_dom->hierarchy) return ERR_PTR(-ENOMEM); refcount_set(&new_dom->hierarchy->usage, 1); /* ...as a child of @parent... */ err = inherit_ruleset(parent, new_dom); if (err) return ERR_PTR(err); /* ...and including @ruleset. */ err = merge_ruleset(new_dom, ruleset); if (err) return ERR_PTR(err); err = landlock_init_hierarchy_log(new_dom->hierarchy); if (err) return ERR_PTR(err); return no_free_ptr(new_dom); } /* * The returned access has the same lifetime as @ruleset. */ const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id) { const struct rb_root *root; const struct rb_node *node; root = get_root((struct landlock_ruleset *)ruleset, id.type); if (IS_ERR(root)) return NULL; node = root->rb_node; while (node) { struct landlock_rule *this = rb_entry(node, struct landlock_rule, node); if (this->key.data == id.key.data) return this; if (this->key.data < id.key.data) node = node->rb_right; else node = node->rb_left; } return NULL; } /* * @layer_masks is read and may be updated according to the access request and * the matching rule. * @masks_array_size must be equal to ARRAY_SIZE(*layer_masks). * * Returns true if the request is allowed (i.e. relevant layer masks for the * request are empty). */ bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size) { size_t layer_level; if (!access_request || !layer_masks) return true; if (!rule) return false; /* * An access is granted if, for each policy layer, at least one rule * encountered on the pathwalk grants the requested access, * regardless of its position in the layer stack. We must then check * the remaining layers for each inode, from the first added layer to * the last one. When there is multiple requested accesses, for each * policy layer, the full set of requested accesses may not be granted * by only one rule, but by the union (binary OR) of multiple rules. * E.g. /a/b <execute> + /a <read> => /a/b <execute + read> */ for (layer_level = 0; layer_level < rule->num_layers; layer_level++) { const struct landlock_layer *const layer = &rule->layers[layer_level]; const layer_mask_t layer_bit = BIT_ULL(layer->level - 1); const unsigned long access_req = access_request; unsigned long access_bit; bool is_empty; /* * Records in @layer_masks which layer grants access to each * requested access. */ is_empty = true; for_each_set_bit(access_bit, &access_req, masks_array_size) { if (layer->access & BIT_ULL(access_bit)) (*layer_masks)[access_bit] &= ~layer_bit; is_empty = is_empty && !(*layer_masks)[access_bit]; } if (is_empty) return true; } return false; } typedef access_mask_t get_access_mask_t(const struct landlock_ruleset *const ruleset, const u16 layer_level); /** * landlock_init_layer_masks - Initialize layer masks from an access request * * Populates @layer_masks such that for each access right in @access_request, * the bits for all the layers are set where this access right is handled. * * @domain: The domain that defines the current restrictions. * @access_request: The requested access rights to check. * @layer_masks: It must contain %LANDLOCK_NUM_ACCESS_FS or * %LANDLOCK_NUM_ACCESS_NET elements according to @key_type. * @key_type: The key type to switch between access masks of different types. * * Returns: An access mask where each access right bit is set which is handled * in any of the active layers in @domain. */ access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type) { access_mask_t handled_accesses = 0; size_t layer_level, num_access; get_access_mask_t *get_access_mask; switch (key_type) { case LANDLOCK_KEY_INODE: get_access_mask = landlock_get_fs_access_mask; num_access = LANDLOCK_NUM_ACCESS_FS; break; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: get_access_mask = landlock_get_net_access_mask; num_access = LANDLOCK_NUM_ACCESS_NET; break; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return 0; } memset(layer_masks, 0, array_size(sizeof((*layer_masks)[0]), num_access)); /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; /* Saves all handled accesses per layer. */ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { const unsigned long access_req = access_request; const access_mask_t access_mask = get_access_mask(domain, layer_level); unsigned long access_bit; for_each_set_bit(access_bit, &access_req, num_access) { if (BIT_ULL(access_bit) & access_mask) { (*layer_masks)[access_bit] |= BIT_ULL(layer_level); handled_accesses |= BIT_ULL(access_bit); } } } return handled_accesses; } |
| 28 23 26 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 | /* * linux/fs/nls/nls_cp864.c * * Charset cp864 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x066a, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x00b0, 0x00b7, 0x2219, 0x221a, 0x2592, 0x2500, 0x2502, 0x253c, 0x2524, 0x252c, 0x251c, 0x2534, 0x2510, 0x250c, 0x2514, 0x2518, /* 0x90*/ 0x03b2, 0x221e, 0x03c6, 0x00b1, 0x00bd, 0x00bc, 0x2248, 0x00ab, 0x00bb, 0xfef7, 0xfef8, 0x0000, 0x0000, 0xfefb, 0xfefc, 0x0000, /* 0xa0*/ 0x00a0, 0x00ad, 0xfe82, 0x00a3, 0x00a4, 0xfe84, 0x0000, 0x0000, 0xfe8e, 0xfe8f, 0xfe95, 0xfe99, 0x060c, 0xfe9d, 0xfea1, 0xfea5, /* 0xb0*/ 0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667, 0x0668, 0x0669, 0xfed1, 0x061b, 0xfeb1, 0xfeb5, 0xfeb9, 0x061f, /* 0xc0*/ 0x00a2, 0xfe80, 0xfe81, 0xfe83, 0xfe85, 0xfeca, 0xfe8b, 0xfe8d, 0xfe91, 0xfe93, 0xfe97, 0xfe9b, 0xfe9f, 0xfea3, 0xfea7, 0xfea9, /* 0xd0*/ 0xfeab, 0xfead, 0xfeaf, 0xfeb3, 0xfeb7, 0xfebb, 0xfebf, 0xfec1, 0xfec5, 0xfecb, 0xfecf, 0x00a6, 0x00ac, 0x00f7, 0x00d7, 0xfec9, /* 0xe0*/ 0x0640, 0xfed3, 0xfed7, 0xfedb, 0xfedf, 0xfee3, 0xfee7, 0xfeeb, 0xfeed, 0xfeef, 0xfef3, 0xfebd, 0xfecc, 0xfece, 0xfecd, 0xfee1, /* 0xf0*/ 0xfe7d, 0x0651, 0xfee5, 0xfee9, 0xfeec, 0xfef0, 0xfef2, 0xfed0, 0xfed5, 0xfef5, 0xfef6, 0xfedd, 0xfed9, 0xfef1, 0x25a0, 0x0000, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */ 0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */ }; static const unsigned char page06[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */ 0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ }; static const unsigned char page25[256] = { 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char pagefe[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */ 0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */ 0x00, 0xc8, 0x00, 0xc9, 0x00, 0xaa, 0x00, 0xca, /* 0x90-0x97 */ 0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */ 0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */ 0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */ 0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */ 0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */ 0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */ 0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */ 0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */ 0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */ 0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */ 0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */ 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, page03, NULL, NULL, page06, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagefe, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp864", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp864(void) { return register_nls(&table); } static void __exit exit_nls_cp864(void) { unregister_nls(&table); } module_init(init_nls_cp864) module_exit(exit_nls_cp864) MODULE_DESCRIPTION("NLS Codepage 864 (Arabic)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 188 188 188 187 187 17 52 167 165 1 166 188 129 4 49 94 64 32 71 60 27 70 49 49 47 91 6 127 126 112 7 112 31 6 7 110 186 64 3 124 127 114 186 49 168 53 52 53 6 11 11 20 136 137 24 24 11 149 4 115 35 24 35 3 137 137 27 146 122 35 18 41 12 137 27 102 35 111 41 111 43 54 56 2 12 12 12 135 137 100 12 97 99 97 14 87 4 13 81 150 100 54 150 149 4 1 150 150 24 2 22 28 21 7 21 7 26 2 43 27 16 18 26 26 8 18 18 12 17 15 10 4 5 5 25 25 17 15 1 2 20 8 16 25 35 34 1 20 5 15 15 9 6 1 5 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 | // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/signal.h> #include <linux/unicode.h> #include "f2fs.h" #include "node.h" #include "acl.h" #include "xattr.h" #include <trace/events/f2fs.h> #if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif static unsigned long dir_blocks(struct inode *inode) { return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) return BIT(level + dir_level); else return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) { if (level < MAX_DIR_HASH_DEPTH / 2) return 2; else return 4; } #if IS_ENABLED(CONFIG_UNICODE) /* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { struct super_block *sb = dir->i_sb; unsigned char *buf; int len; if (IS_CASEFOLDED(dir) && !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!buf) return -ENOMEM; len = utf8_casefold(sb->s_encoding, fname->usr_fname, buf, F2FS_NAME_LEN); if (len <= 0) { kmem_cache_free(f2fs_cf_name_slab, buf); if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ return 0; } fname->cf_name.name = buf; fname->cf_name.len = len; } return 0; } void f2fs_free_casefolded_name(struct f2fs_filename *fname) { unsigned char *buf = (unsigned char *)fname->cf_name.name; if (buf) { kmem_cache_free(f2fs_cf_name_slab, buf); fname->cf_name.name = NULL; } } #endif /* CONFIG_UNICODE */ static int __f2fs_setup_filename(const struct inode *dir, const struct fscrypt_name *crypt_name, struct f2fs_filename *fname) { int err; memset(fname, 0, sizeof(*fname)); fname->usr_fname = crypt_name->usr_fname; fname->disk_name = crypt_name->disk_name; #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { err = f2fs_init_casefolded_name(dir, fname); if (err) { f2fs_free_filename(fname); return err; } f2fs_hash_filename(dir, fname); } return 0; } /* * Prepare to search for @iname in @dir. This is similar to * fscrypt_setup_filename(), but this also handles computing the casefolded name * and the f2fs dirhash if needed, then packing all the information about this * filename up into a 'struct f2fs_filename'. */ int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct f2fs_filename *fname) { struct fscrypt_name crypt_name; int err; err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name); if (err) return err; return __f2fs_setup_filename(dir, &crypt_name, fname); } /* * Prepare to look up @dentry in @dir. This is similar to * fscrypt_prepare_lookup(), but this also handles computing the casefolded name * and the f2fs dirhash if needed, then packing all the information about this * filename up into a 'struct f2fs_filename'. */ int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_filename *fname) { struct fscrypt_name crypt_name; int err; err = fscrypt_prepare_lookup(dir, dentry, &crypt_name); if (err) return err; return __f2fs_setup_filename(dir, &crypt_name, fname); } void f2fs_free_filename(struct f2fs_filename *fname) { #ifdef CONFIG_FS_ENCRYPTION kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif f2fs_free_casefolded_name(fname); } static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) bidx += mul_u32_u32(dir_buckets(i, dir_level), bucket_blocks(i)); bidx += idx * bucket_blocks(level); return bidx; } static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct folio *dentry_folio, const struct f2fs_filename *fname, int *max_slots, bool use_hash) { struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; dentry_blk = folio_address(dentry_folio); make_dentry_ptr_block(dir, &d, dentry_blk); return f2fs_find_target_dentry(&d, fname, max_slots, use_hash); } static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) { struct fscrypt_name f; #if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) return generic_ci_match(dir, fname->usr_fname, &fname->cf_name, de_name, de_name_len); #endif f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif return fscrypt_match_name(&f, de_name, de_name_len); } struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, const struct f2fs_filename *fname, int *max_slots, bool use_hash) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; int res = 0; if (max_slots) *max_slots = 0; while (bit_pos < d->max) { if (!test_bit_le(bit_pos, d->bitmap)) { bit_pos++; max_len++; continue; } de = &d->dentry[bit_pos]; if (unlikely(!de->name_len)) { bit_pos++; continue; } if (!use_hash || de->hash_code == fname->hash) { res = f2fs_match_name(d->inode, fname, d->filename[bit_pos], le16_to_cpu(de->name_len)); if (res < 0) return ERR_PTR(res); if (res) goto found; } if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; found: if (max_slots && max_len > *max_slots) *max_slots = max_len; return de; } static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, const struct f2fs_filename *fname, struct folio **res_folio, bool use_hash) { int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block, bucket_no; struct f2fs_dir_entry *de = NULL; pgoff_t next_pgofs; bool room = false; int max_slots; nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); bucket_no = use_hash ? le32_to_cpu(fname->hash) % nbucket : 0; start_find_bucket: bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, bucket_no); end_block = bidx + nblock; while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ struct folio *dentry_folio; dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs); if (IS_ERR(dentry_folio)) { if (PTR_ERR(dentry_folio) == -ENOENT) { room = true; bidx = next_pgofs; continue; } else { *res_folio = dentry_folio; break; } } de = find_in_block(dir, dentry_folio, fname, &max_slots, use_hash); if (IS_ERR(de)) { *res_folio = ERR_CAST(de); de = NULL; break; } else if (de) { *res_folio = dentry_folio; break; } if (max_slots >= s) room = true; f2fs_folio_put(dentry_folio, false); bidx++; } if (de) return de; if (likely(use_hash)) { if (room && F2FS_I(dir)->chash != fname->hash) { F2FS_I(dir)->chash = fname->hash; F2FS_I(dir)->clevel = level; } } else if (++bucket_no < nbucket) { goto start_find_bucket; } return NULL; } struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, const struct f2fs_filename *fname, struct folio **res_folio) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; bool use_hash = true; *res_folio = NULL; #if IS_ENABLED(CONFIG_UNICODE) start_find_entry: #endif if (f2fs_has_inline_dentry(dir)) { de = f2fs_find_in_inline_dir(dir, fname, res_folio, use_hash); goto out; } if (npages == 0) goto out; max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; f2fs_i_depth_write(dir, max_depth); } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, fname, res_folio, use_hash); if (de || IS_ERR(*res_folio)) break; } out: #if IS_ENABLED(CONFIG_UNICODE) if (!sb_no_casefold_compat_fallback(dir->i_sb) && IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; } #endif /* This is to increase the speed of f2fs_create */ if (!de) F2FS_I(dir)->task = current; return de; } /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, const struct qstr *child, struct folio **res_folio) { struct f2fs_dir_entry *de = NULL; struct f2fs_filename fname; int err; err = f2fs_setup_filename(dir, child, 1, &fname); if (err) { if (err == -ENOENT) *res_folio = NULL; else *res_folio = ERR_PTR(err); return NULL; } de = __f2fs_find_entry(dir, &fname, res_folio); f2fs_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f) { return f2fs_find_entry(dir, &dotdot_name, f); } ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct folio **folio) { ino_t res = 0; struct f2fs_dir_entry *de; de = f2fs_find_entry(dir, qstr, folio); if (de) { res = le32_to_cpu(de->ino); f2fs_folio_put(*folio, false); } return res; } void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct folio *folio, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; folio_lock(folio); f2fs_folio_wait_writeback(folio, type, true, true); de->ino = cpu_to_le32(inode->i_ino); de->file_type = fs_umode_to_ftype(inode->i_mode); folio_mark_dirty(folio); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); f2fs_folio_put(folio, true); } static void init_dent_inode(struct inode *dir, struct inode *inode, const struct f2fs_filename *fname, struct folio *ifolio) { struct f2fs_inode *ri; if (!fname) /* tmpfile case? */ return; f2fs_folio_wait_writeback(ifolio, NODE, true, true); /* copy name info. to this inode folio */ ri = F2FS_INODE(&ifolio->page); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); if (IS_ENCRYPTED(dir)) { file_set_enc_name(inode); /* * Roll-forward recovery doesn't have encryption keys available, * so it can't compute the dirhash for encrypted+casefolded * filenames. Append it to i_name if possible. Else, disable * roll-forward recovery of the dentry (i.e., make fsync'ing the * file force a checkpoint) by setting LOST_PINO. */ if (IS_CASEFOLDED(dir)) { if (fname->disk_name.len + sizeof(f2fs_hash_t) <= F2FS_NAME_LEN) put_unaligned(fname->hash, (f2fs_hash_t *) &ri->i_name[fname->disk_name.len]); else file_lost_pino(inode); } } folio_mark_dirty(ifolio); } void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { struct fscrypt_str dot = FSTR_INIT(".", 1); struct fscrypt_str dotdot = FSTR_INIT("..", 2); /* update dirent of "." */ f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); /* update dirent of ".." */ f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, struct inode *parent, struct folio *folio) { struct folio *dentry_folio; struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; if (f2fs_has_inline_dentry(inode)) return f2fs_make_empty_inline_dir(inode, parent, folio); dentry_folio = f2fs_get_new_data_folio(inode, folio, 0, true); if (IS_ERR(dentry_folio)) return PTR_ERR(dentry_folio); dentry_blk = folio_address(dentry_folio); make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_do_make_empty_dir(inode, parent, &d); folio_mark_dirty(dentry_folio); f2fs_folio_put(dentry_folio, true); return 0; } struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct f2fs_filename *fname, struct folio *dfolio) { struct folio *folio; int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { folio = f2fs_new_inode_folio(inode); if (IS_ERR(folio)) return folio; if (S_ISDIR(inode->i_mode)) { /* in order to handle error case */ folio_get(folio); err = make_empty_dir(inode, dir, folio); if (err) { folio_lock(folio); goto put_error; } folio_put(folio); } err = f2fs_init_acl(inode, dir, folio, dfolio); if (err) goto put_error; err = f2fs_init_security(inode, dir, fname ? fname->usr_fname : NULL, folio); if (err) goto put_error; if (IS_ENCRYPTED(inode)) { err = fscrypt_set_context(inode, folio); if (err) goto put_error; } } else { folio = f2fs_get_inode_folio(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(folio)) return folio; } init_dent_inode(dir, inode, fname, folio); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { if (!S_ISDIR(inode->i_mode)) file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. */ if (inode->i_nlink == 0) f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); f2fs_i_links_write(inode, true); } return folio; put_error: clear_nlink(inode); f2fs_update_inode(inode, folio); f2fs_folio_put(folio, true); return ERR_PTR(err); } void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); if (inode && is_inode_flag_set(inode, FI_INC_LINK)) clear_inode_flag(inode, FI_INC_LINK); } int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; next: zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start); if (zero_start >= max_slots) return max_slots; zero_end = find_next_bit_le(bitmap, max_slots, zero_start); if (zero_end - zero_start >= slots) return zero_start; bit_start = zero_end + 1; if (zero_end + 1 >= max_slots) return max_slots; goto next; } bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, const struct f2fs_filename *fname) { struct f2fs_dentry_ptr d; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(fname->disk_name.len); make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ifolio)); bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); return bit_pos < d.max; } void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct fscrypt_str *name, f2fs_hash_t name_hash, unsigned int bit_pos) { struct f2fs_dir_entry *de; int slots = GET_DENTRY_SLOTS(name->len); int i; de = &d->dentry[bit_pos]; de->hash_code = name_hash; de->name_len = cpu_to_le16(name->len); memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); de->file_type = fs_umode_to_ftype(mode); for (i = 0; i < slots; i++) { __set_bit_le(bit_pos + i, (void *)d->bitmap); /* avoid wrong garbage data for readdir */ if (i) (de + i)->name_len = 0; } } int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; unsigned int nbucket, nblock; struct folio *dentry_folio = NULL; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct folio *folio = NULL; int slots, err = 0; level = 0; slots = GET_DENTRY_SLOTS(fname->disk_name.len); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == fname->hash) { level = F2FS_I(dir)->clevel; F2FS_I(dir)->chash = 0; } start: if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) return -ENOSPC; if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) ++current_depth; nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, (le32_to_cpu(fname->hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_folio = f2fs_get_new_data_folio(dir, NULL, block, true); if (IS_ERR(dentry_folio)) return PTR_ERR(dentry_folio); dentry_blk = folio_address(dentry_folio); bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; f2fs_folio_put(dentry_folio, true); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: f2fs_folio_wait_writeback(dentry_folio, DATA, true, true); if (inode) { f2fs_down_write(&F2FS_I(inode)->i_sem); folio = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto fail; } } make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, bit_pos); folio_mark_dirty(dentry_folio); if (inode) { f2fs_i_pino_write(inode, dir->i_ino); /* synchronize inode page's data from inode cache */ if (is_inode_flag_set(inode, FI_NEW_INODE)) f2fs_update_inode(inode, folio); f2fs_folio_put(folio, true); } f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) f2fs_up_write(&F2FS_I(inode)->i_sem); f2fs_folio_put(dentry_folio, true); return err; } int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode) { int err = -EAGAIN; if (f2fs_has_inline_dentry(dir)) { /* * Should get i_xattr_sem to keep the lock order: * i_xattr_sem -> inode_page lock used by f2fs_setxattr. */ f2fs_down_read(&F2FS_I(dir)->i_xattr_sem); err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); f2fs_up_read(&F2FS_I(dir)->i_xattr_sem); } if (err == -EAGAIN) err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } /* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_filename fname; struct folio *folio = NULL; struct f2fs_dir_entry *de = NULL; int err; err = f2fs_setup_filename(dir, name, 0, &fname); if (err) return err; /* * An immature stackable filesystem shows a race condition between lookup * and create. If we have same task when doing lookup and create, it's * definitely fine as expected by VFS normally. Otherwise, let's just * verify on-disk dentry one more time, which guarantees filesystem * consistency more. */ if (current != F2FS_I(dir)->task) { de = __f2fs_find_entry(dir, &fname, &folio); F2FS_I(dir)->task = NULL; } if (de) { f2fs_folio_put(folio, false); err = -EEXIST; } else if (IS_ERR(folio)) { err = PTR_ERR(folio); } else { err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } f2fs_free_filename(&fname); return err; } int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname) { struct folio *folio; int err = 0; f2fs_down_write(&F2FS_I(inode)->i_sem); folio = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto fail; } f2fs_folio_put(folio, true); clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); inode_set_ctime_current(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) f2fs_add_orphan_inode(inode); else f2fs_release_orphan_inode(sbi); } /* * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. */ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode) { struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); pgoff_t index = folio->index; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, folio, dir, inode); folio_lock(folio); f2fs_folio_wait_writeback(folio, DATA, true, true); dentry_blk = folio_address(folio); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); folio_mark_dirty(folio); if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, index, index + 1)) { f2fs_clear_page_cache_dirty_tag(folio); folio_clear_dirty_for_io(folio); folio_clear_uptodate(folio); clear_page_private_all(&folio->page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); } f2fs_folio_put(folio, true); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); } bool f2fs_empty_dir(struct inode *dir) { unsigned long bidx = 0; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; unsigned long nblock = dir_blocks(dir); if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); while (bidx < nblock) { pgoff_t next_pgofs; struct folio *dentry_folio; dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs); if (IS_ERR(dentry_folio)) { if (PTR_ERR(dentry_folio) == -ENOENT) { bidx = next_pgofs; continue; } else { return false; } } dentry_blk = folio_address(dentry_folio); if (bidx == 0) bit_pos = 2; else bit_pos = 0; bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); f2fs_folio_put(dentry_folio, false); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; bidx++; } return true; } int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); if (readdir_ra) blk_start_plug(&plug); while (bit_pos < d->max) { bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); if (bit_pos >= d->max) break; de = &d->dentry[bit_pos]; if (de->name_len == 0) { if (found_valid_dirent || !bit_pos) { f2fs_warn_ratelimited(sbi, "invalid namelen(0), ino:%u, run fsck to fix.", le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); } bit_pos++; ctx->pos = start_pos + bit_pos; continue; } d_type = fs_ftype_to_dtype(de->file_type); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); if (unlikely(bit_pos > d->max || le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_warn(sbi, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } if (IS_ENCRYPTED(d->inode)) { int save_len = fstr->len; err = fscrypt_fname_disk_to_usr(d->inode, (u32)le32_to_cpu(de->hash_code), 0, &de_name, fstr); if (err) goto out; de_name = *fstr; fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) { err = 1; goto out; } if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; found_valid_dirent = true; } out: if (readdir_ra) blk_finish_plug(&plug); return err; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); struct f2fs_dentry_block *dentry_blk = NULL; struct file_ra_state *ra = &file->f_ra; loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (IS_ENCRYPTED(inode)) { err = fscrypt_prepare_readdir(inode); if (err) goto out; err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); if (err < 0) goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); goto out_free; } for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { struct folio *dentry_folio; pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_free; } cond_resched(); /* readahead for multi pages of dir */ if (npages - n > 1 && !ra_has_index(ra, n)) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); dentry_folio = f2fs_find_data_folio(inode, n, &next_pgofs); if (IS_ERR(dentry_folio)) { err = PTR_ERR(dentry_folio); if (err == -ENOENT) { err = 0; n = next_pgofs; continue; } else { goto out_free; } } dentry_blk = folio_address(dentry_folio); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); f2fs_folio_put(dentry_folio, false); if (err) break; n++; } out_free: fscrypt_fname_free_buffer(&fstr); out: trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? err : 0; } const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif }; |
| 3 195 195 195 203 201 202 202 194 201 8 187 10 194 193 194 8 202 202 8 1 190 10 3 3 195 194 194 195 195 193 194 48 10 203 1 202 7 192 190 1 2 3 3 3 195 195 195 195 194 202 203 195 8 203 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 | // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/random.h> #include <linux/slab.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" /* * LOG FILE structs */ // clang-format off #define MaxLogFileSize 0x100000000ull #define DefaultLogPageSize 4096 #define MinLogRecordPages 0x30 struct RESTART_HDR { struct NTFS_RECORD_HEADER rhdr; // 'RSTR' __le32 sys_page_size; // 0x10: Page size of the system which initialized the log. __le32 page_size; // 0x14: Log page size used for this log file. __le16 ra_off; // 0x18: __le16 minor_ver; // 0x1A: __le16 major_ver; // 0x1C: __le16 fixups[]; }; #define LFS_NO_CLIENT 0xffff #define LFS_NO_CLIENT_LE cpu_to_le16(0xffff) struct CLIENT_REC { __le64 oldest_lsn; __le64 restart_lsn; // 0x08: __le16 prev_client; // 0x10: __le16 next_client; // 0x12: __le16 seq_num; // 0x14: u8 align[6]; // 0x16: __le32 name_bytes; // 0x1C: In bytes. __le16 name[32]; // 0x20: Name of client. }; static_assert(sizeof(struct CLIENT_REC) == 0x60); /* Two copies of these will exist at the beginning of the log file */ struct RESTART_AREA { __le64 current_lsn; // 0x00: Current logical end of log file. __le16 log_clients; // 0x08: Maximum number of clients. __le16 client_idx[2]; // 0x0A: Free/use index into the client record arrays. __le16 flags; // 0x0E: See RESTART_SINGLE_PAGE_IO. __le32 seq_num_bits; // 0x10: The number of bits in sequence number. __le16 ra_len; // 0x14: __le16 client_off; // 0x16: __le64 l_size; // 0x18: Usable log file size. __le32 last_lsn_data_len; // 0x20: __le16 rec_hdr_len; // 0x24: Log page data offset. __le16 data_off; // 0x26: Log page data length. __le32 open_log_count; // 0x28: __le32 align[5]; // 0x2C: struct CLIENT_REC clients[]; // 0x40: }; struct LOG_REC_HDR { __le16 redo_op; // 0x00: NTFS_LOG_OPERATION __le16 undo_op; // 0x02: NTFS_LOG_OPERATION __le16 redo_off; // 0x04: Offset to Redo record. __le16 redo_len; // 0x06: Redo length. __le16 undo_off; // 0x08: Offset to Undo record. __le16 undo_len; // 0x0A: Undo length. __le16 target_attr; // 0x0C: __le16 lcns_follow; // 0x0E: __le16 record_off; // 0x10: __le16 attr_off; // 0x12: __le16 cluster_off; // 0x14: __le16 reserved; // 0x16: __le64 target_vcn; // 0x18: __le64 page_lcns[]; // 0x20: }; static_assert(sizeof(struct LOG_REC_HDR) == 0x20); #define RESTART_ENTRY_ALLOCATED 0xFFFFFFFF #define RESTART_ENTRY_ALLOCATED_LE cpu_to_le32(0xFFFFFFFF) struct RESTART_TABLE { __le16 size; // 0x00: In bytes __le16 used; // 0x02: Entries __le16 total; // 0x04: Entries __le16 res[3]; // 0x06: __le32 free_goal; // 0x0C: __le32 first_free; // 0x10: __le32 last_free; // 0x14: }; static_assert(sizeof(struct RESTART_TABLE) == 0x18); struct ATTR_NAME_ENTRY { __le16 off; // Offset in the Open attribute Table. __le16 name_bytes; __le16 name[]; }; struct OPEN_ATTR_ENRTY { __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated __le32 bytes_per_index; // 0x04: enum ATTR_TYPE type; // 0x08: u8 is_dirty_pages; // 0x0C: u8 is_attr_name; // 0x0B: Faked field to manage 'ptr' u8 name_len; // 0x0C: Faked field to manage 'ptr' u8 res; struct MFT_REF ref; // 0x10: File Reference of file containing attribute __le64 open_record_lsn; // 0x18: void *ptr; // 0x20: }; /* 32 bit version of 'struct OPEN_ATTR_ENRTY' */ struct OPEN_ATTR_ENRTY_32 { __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated __le32 ptr; // 0x04: struct MFT_REF ref; // 0x08: __le64 open_record_lsn; // 0x10: u8 is_dirty_pages; // 0x18: u8 is_attr_name; // 0x19: u8 res1[2]; enum ATTR_TYPE type; // 0x1C: u8 name_len; // 0x20: In wchar u8 res2[3]; __le32 AttributeName; // 0x24: __le32 bytes_per_index; // 0x28: }; #define SIZEOF_OPENATTRIBUTEENTRY0 0x2c // static_assert( 0x2C == sizeof(struct OPEN_ATTR_ENRTY_32) ); static_assert(sizeof(struct OPEN_ATTR_ENRTY) < SIZEOF_OPENATTRIBUTEENTRY0); /* * One entry exists in the Dirty Pages Table for each page which is dirty at * the time the Restart Area is written. */ struct DIR_PAGE_ENTRY { __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated __le32 target_attr; // 0x04: Index into the Open attribute Table __le32 transfer_len; // 0x08: __le32 lcns_follow; // 0x0C: __le64 vcn; // 0x10: Vcn of dirty page __le64 oldest_lsn; // 0x18: __le64 page_lcns[]; // 0x20: }; static_assert(sizeof(struct DIR_PAGE_ENTRY) == 0x20); /* 32 bit version of 'struct DIR_PAGE_ENTRY' */ struct DIR_PAGE_ENTRY_32 { __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated __le32 target_attr; // 0x04: Index into the Open attribute Table __le32 transfer_len; // 0x08: __le32 lcns_follow; // 0x0C: __le32 reserved; // 0x10: __le32 vcn_low; // 0x14: Vcn of dirty page __le32 vcn_hi; // 0x18: Vcn of dirty page __le32 oldest_lsn_low; // 0x1C: __le32 oldest_lsn_hi; // 0x1C: __le32 page_lcns_low; // 0x24: __le32 page_lcns_hi; // 0x24: }; static_assert(offsetof(struct DIR_PAGE_ENTRY_32, vcn_low) == 0x14); static_assert(sizeof(struct DIR_PAGE_ENTRY_32) == 0x2c); enum transact_state { TransactionUninitialized = 0, TransactionActive, TransactionPrepared, TransactionCommitted }; struct TRANSACTION_ENTRY { __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated u8 transact_state; // 0x04: u8 reserved[3]; // 0x05: __le64 first_lsn; // 0x08: __le64 prev_lsn; // 0x10: __le64 undo_next_lsn; // 0x18: __le32 undo_records; // 0x20: Number of undo log records pending abort __le32 undo_len; // 0x24: Total undo size }; static_assert(sizeof(struct TRANSACTION_ENTRY) == 0x28); struct NTFS_RESTART { __le32 major_ver; // 0x00: __le32 minor_ver; // 0x04: __le64 check_point_start; // 0x08: __le64 open_attr_table_lsn; // 0x10: __le64 attr_names_lsn; // 0x18: __le64 dirty_pages_table_lsn; // 0x20: __le64 transact_table_lsn; // 0x28: __le32 open_attr_len; // 0x30: In bytes __le32 attr_names_len; // 0x34: In bytes __le32 dirty_pages_len; // 0x38: In bytes __le32 transact_table_len; // 0x3C: In bytes }; static_assert(sizeof(struct NTFS_RESTART) == 0x40); struct NEW_ATTRIBUTE_SIZES { __le64 alloc_size; __le64 valid_size; __le64 data_size; __le64 total_size; }; struct BITMAP_RANGE { __le32 bitmap_off; __le32 bits; }; struct LCN_RANGE { __le64 lcn; __le64 len; }; /* The following type defines the different log record types. */ #define LfsClientRecord cpu_to_le32(1) #define LfsClientRestart cpu_to_le32(2) /* This is used to uniquely identify a client for a particular log file. */ struct CLIENT_ID { __le16 seq_num; __le16 client_idx; }; /* This is the header that begins every Log Record in the log file. */ struct LFS_RECORD_HDR { __le64 this_lsn; // 0x00: __le64 client_prev_lsn; // 0x08: __le64 client_undo_next_lsn; // 0x10: __le32 client_data_len; // 0x18: struct CLIENT_ID client; // 0x1C: Owner of this log record. __le32 record_type; // 0x20: LfsClientRecord or LfsClientRestart. __le32 transact_id; // 0x24: __le16 flags; // 0x28: LOG_RECORD_MULTI_PAGE u8 align[6]; // 0x2A: }; #define LOG_RECORD_MULTI_PAGE cpu_to_le16(1) static_assert(sizeof(struct LFS_RECORD_HDR) == 0x30); struct LFS_RECORD { __le16 next_record_off; // 0x00: Offset of the free space in the page, u8 align[6]; // 0x02: __le64 last_end_lsn; // 0x08: lsn for the last log record which ends on the page, }; static_assert(sizeof(struct LFS_RECORD) == 0x10); struct RECORD_PAGE_HDR { struct NTFS_RECORD_HEADER rhdr; // 'RCRD' __le32 rflags; // 0x10: See LOG_PAGE_LOG_RECORD_END __le16 page_count; // 0x14: __le16 page_pos; // 0x16: struct LFS_RECORD record_hdr; // 0x18: __le16 fixups[10]; // 0x28: __le32 file_off; // 0x3c: Used when major version >= 2 }; // clang-format on // Page contains the end of a log record. #define LOG_PAGE_LOG_RECORD_END cpu_to_le32(0x00000001) static inline bool is_log_record_end(const struct RECORD_PAGE_HDR *hdr) { return hdr->rflags & LOG_PAGE_LOG_RECORD_END; } static_assert(offsetof(struct RECORD_PAGE_HDR, file_off) == 0x3c); /* * END of NTFS LOG structures */ /* Define some tuning parameters to keep the restart tables a reasonable size. */ #define INITIAL_NUMBER_TRANSACTIONS 5 enum NTFS_LOG_OPERATION { Noop = 0x00, CompensationLogRecord = 0x01, InitializeFileRecordSegment = 0x02, DeallocateFileRecordSegment = 0x03, WriteEndOfFileRecordSegment = 0x04, CreateAttribute = 0x05, DeleteAttribute = 0x06, UpdateResidentValue = 0x07, UpdateNonresidentValue = 0x08, UpdateMappingPairs = 0x09, DeleteDirtyClusters = 0x0A, SetNewAttributeSizes = 0x0B, AddIndexEntryRoot = 0x0C, DeleteIndexEntryRoot = 0x0D, AddIndexEntryAllocation = 0x0E, DeleteIndexEntryAllocation = 0x0F, WriteEndOfIndexBuffer = 0x10, SetIndexEntryVcnRoot = 0x11, SetIndexEntryVcnAllocation = 0x12, UpdateFileNameRoot = 0x13, UpdateFileNameAllocation = 0x14, SetBitsInNonresidentBitMap = 0x15, ClearBitsInNonresidentBitMap = 0x16, HotFix = 0x17, EndTopLevelAction = 0x18, PrepareTransaction = 0x19, CommitTransaction = 0x1A, ForgetTransaction = 0x1B, OpenNonresidentAttribute = 0x1C, OpenAttributeTableDump = 0x1D, AttributeNamesDump = 0x1E, DirtyPageTableDump = 0x1F, TransactionTableDump = 0x20, UpdateRecordDataRoot = 0x21, UpdateRecordDataAllocation = 0x22, UpdateRelativeDataInIndex = 0x23, // NtOfsRestartUpdateRelativeDataInIndex UpdateRelativeDataInIndex2 = 0x24, ZeroEndOfFileRecord = 0x25, }; /* * Array for log records which require a target attribute. * A true indicates that the corresponding restart operation * requires a target attribute. */ static const u8 AttributeRequired[] = { 0xFC, 0xFB, 0xFF, 0x10, 0x06, }; static inline bool is_target_required(u16 op) { bool ret = op <= UpdateRecordDataAllocation && (AttributeRequired[op >> 3] >> (op & 7) & 1); return ret; } static inline bool can_skip_action(enum NTFS_LOG_OPERATION op) { switch (op) { case Noop: case DeleteDirtyClusters: case HotFix: case EndTopLevelAction: case PrepareTransaction: case CommitTransaction: case ForgetTransaction: case CompensationLogRecord: case OpenNonresidentAttribute: case OpenAttributeTableDump: case AttributeNamesDump: case DirtyPageTableDump: case TransactionTableDump: return true; default: return false; } } enum { lcb_ctx_undo_next, lcb_ctx_prev, lcb_ctx_next }; /* Bytes per restart table. */ static inline u32 bytes_per_rt(const struct RESTART_TABLE *rt) { return le16_to_cpu(rt->used) * le16_to_cpu(rt->size) + sizeof(struct RESTART_TABLE); } /* Log record length. */ static inline u32 lrh_length(const struct LOG_REC_HDR *lr) { u16 t16 = le16_to_cpu(lr->lcns_follow); return struct_size(lr, page_lcns, max_t(u16, 1, t16)); } struct lcb { struct LFS_RECORD_HDR *lrh; // Log record header of the current lsn. struct LOG_REC_HDR *log_rec; u32 ctx_mode; // lcb_ctx_undo_next/lcb_ctx_prev/lcb_ctx_next struct CLIENT_ID client; bool alloc; // If true the we should deallocate 'log_rec'. }; static void lcb_put(struct lcb *lcb) { if (lcb->alloc) kfree(lcb->log_rec); kfree(lcb->lrh); kfree(lcb); } /* Find the oldest lsn from active clients. */ static inline void oldest_client_lsn(const struct CLIENT_REC *ca, __le16 next_client, u64 *oldest_lsn) { while (next_client != LFS_NO_CLIENT_LE) { const struct CLIENT_REC *cr = ca + le16_to_cpu(next_client); u64 lsn = le64_to_cpu(cr->oldest_lsn); /* Ignore this block if it's oldest lsn is 0. */ if (lsn && lsn < *oldest_lsn) *oldest_lsn = lsn; next_client = cr->next_client; } } static inline bool is_rst_page_hdr_valid(u32 file_off, const struct RESTART_HDR *rhdr) { u32 sys_page = le32_to_cpu(rhdr->sys_page_size); u32 page_size = le32_to_cpu(rhdr->page_size); u32 end_usa; u16 ro; if (sys_page < SECTOR_SIZE || page_size < SECTOR_SIZE || sys_page & (sys_page - 1) || page_size & (page_size - 1)) { return false; } /* Check that if the file offset isn't 0, it is the system page size. */ if (file_off && file_off != sys_page) return false; /* Check support version 1.1+. */ if (le16_to_cpu(rhdr->major_ver) <= 1 && !rhdr->minor_ver) return false; if (le16_to_cpu(rhdr->major_ver) > 2) return false; ro = le16_to_cpu(rhdr->ra_off); if (!IS_ALIGNED(ro, 8) || ro > sys_page) return false; end_usa = ((sys_page >> SECTOR_SHIFT) + 1) * sizeof(short); end_usa += le16_to_cpu(rhdr->rhdr.fix_off); if (ro < end_usa) return false; return true; } static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) { const struct RESTART_AREA *ra; u16 cl, fl, ul; u32 off, l_size, seq_bits; u16 ro = le16_to_cpu(rhdr->ra_off); u32 sys_page = le32_to_cpu(rhdr->sys_page_size); if (ro + offsetof(struct RESTART_AREA, l_size) > SECTOR_SIZE - sizeof(short)) return false; ra = Add2Ptr(rhdr, ro); cl = le16_to_cpu(ra->log_clients); if (cl > 1) return false; off = le16_to_cpu(ra->client_off); if (!IS_ALIGNED(off, 8) || ro + off > SECTOR_SIZE - sizeof(short)) return false; off += cl * sizeof(struct CLIENT_REC); if (off > sys_page) return false; /* * Check the restart length field and whether the entire * restart area is contained that length. */ if (le16_to_cpu(rhdr->ra_off) + le16_to_cpu(ra->ra_len) > sys_page || off > le16_to_cpu(ra->ra_len)) { return false; } /* * As a final check make sure that the use list and the free list * are either empty or point to a valid client. */ fl = le16_to_cpu(ra->client_idx[0]); ul = le16_to_cpu(ra->client_idx[1]); if ((fl != LFS_NO_CLIENT && fl >= cl) || (ul != LFS_NO_CLIENT && ul >= cl)) return false; /* Make sure the sequence number bits match the log file size. */ l_size = le64_to_cpu(ra->l_size); seq_bits = sizeof(u64) * 8 + 3; while (l_size) { l_size >>= 1; seq_bits -= 1; } if (seq_bits != le32_to_cpu(ra->seq_num_bits)) return false; /* The log page data offset and record header length must be quad-aligned. */ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) return false; return true; } static inline bool is_client_area_valid(const struct RESTART_HDR *rhdr, bool usa_error) { u16 ro = le16_to_cpu(rhdr->ra_off); const struct RESTART_AREA *ra = Add2Ptr(rhdr, ro); u16 ra_len = le16_to_cpu(ra->ra_len); const struct CLIENT_REC *ca; u32 i; if (usa_error && ra_len + ro > SECTOR_SIZE - sizeof(short)) return false; /* Find the start of the client array. */ ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); /* * Start with the free list. * Check that all the clients are valid and that there isn't a cycle. * Do the in-use list on the second pass. */ for (i = 0; i < 2; i++) { u16 client_idx = le16_to_cpu(ra->client_idx[i]); bool first_client = true; u16 clients = le16_to_cpu(ra->log_clients); while (client_idx != LFS_NO_CLIENT) { const struct CLIENT_REC *cr; if (!clients || client_idx >= le16_to_cpu(ra->log_clients)) return false; clients -= 1; cr = ca + client_idx; client_idx = le16_to_cpu(cr->next_client); if (first_client) { first_client = false; if (cr->prev_client != LFS_NO_CLIENT_LE) return false; } } } return true; } /* * remove_client * * Remove a client record from a client record list an restart area. */ static inline void remove_client(struct CLIENT_REC *ca, const struct CLIENT_REC *cr, __le16 *head) { if (cr->prev_client == LFS_NO_CLIENT_LE) *head = cr->next_client; else ca[le16_to_cpu(cr->prev_client)].next_client = cr->next_client; if (cr->next_client != LFS_NO_CLIENT_LE) ca[le16_to_cpu(cr->next_client)].prev_client = cr->prev_client; } /* * add_client - Add a client record to the start of a list. */ static inline void add_client(struct CLIENT_REC *ca, u16 index, __le16 *head) { struct CLIENT_REC *cr = ca + index; cr->prev_client = LFS_NO_CLIENT_LE; cr->next_client = *head; if (*head != LFS_NO_CLIENT_LE) ca[le16_to_cpu(*head)].prev_client = cpu_to_le16(index); *head = cpu_to_le16(index); } /* * Enumerate restart table. * * @t - table to enumerate. * @c - current enumerated element. * * enumeration starts with @c == NULL * returns next element or NULL */ static inline void *enum_rstbl(struct RESTART_TABLE *t, void *c) { __le32 *e; u32 bprt; u16 rsize; if (!t) return NULL; rsize = le16_to_cpu(t->size); if (!c) { /* start enumeration. */ if (!t->total) return NULL; e = Add2Ptr(t, sizeof(struct RESTART_TABLE)); } else { e = Add2Ptr(c, rsize); } /* Loop until we hit the first one allocated, or the end of the list. */ for (bprt = bytes_per_rt(t); PtrOffset(t, e) < bprt; e = Add2Ptr(e, rsize)) { if (*e == RESTART_ENTRY_ALLOCATED_LE) return e; } return NULL; } /* * find_dp - Search for a @vcn in Dirty Page Table. */ static inline struct DIR_PAGE_ENTRY *find_dp(struct RESTART_TABLE *dptbl, u32 target_attr, u64 vcn) { __le32 ta = cpu_to_le32(target_attr); struct DIR_PAGE_ENTRY *dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { u64 dp_vcn = le64_to_cpu(dp->vcn); if (dp->target_attr == ta && vcn >= dp_vcn && vcn < dp_vcn + le32_to_cpu(dp->lcns_follow)) { return dp; } } return NULL; } static inline u32 norm_file_page(u32 page_size, u32 *l_size, bool use_default) { if (use_default) page_size = DefaultLogPageSize; /* Round the file size down to a system page boundary. */ *l_size &= ~(page_size - 1); /* File should contain at least 2 restart pages and MinLogRecordPages pages. */ if (*l_size < (MinLogRecordPages + 2) * page_size) return 0; return page_size; } static bool check_log_rec(const struct LOG_REC_HDR *lr, u32 bytes, u32 tr, u32 bytes_per_attr_entry) { u16 t16; if (bytes < sizeof(struct LOG_REC_HDR)) return false; if (!tr) return false; if ((tr - sizeof(struct RESTART_TABLE)) % sizeof(struct TRANSACTION_ENTRY)) return false; if (le16_to_cpu(lr->redo_off) & 7) return false; if (le16_to_cpu(lr->undo_off) & 7) return false; if (lr->target_attr) goto check_lcns; if (is_target_required(le16_to_cpu(lr->redo_op))) return false; if (is_target_required(le16_to_cpu(lr->undo_op))) return false; check_lcns: if (!lr->lcns_follow) goto check_length; t16 = le16_to_cpu(lr->target_attr); if ((t16 - sizeof(struct RESTART_TABLE)) % bytes_per_attr_entry) return false; check_length: if (bytes < lrh_length(lr)) return false; return true; } static bool check_rstbl(const struct RESTART_TABLE *rt, size_t bytes) { u32 ts; u32 i, off; u16 rsize = le16_to_cpu(rt->size); u16 ne = le16_to_cpu(rt->used); u32 ff = le32_to_cpu(rt->first_free); u32 lf = le32_to_cpu(rt->last_free); ts = rsize * ne + sizeof(struct RESTART_TABLE); if (!rsize || rsize > bytes || rsize + sizeof(struct RESTART_TABLE) > bytes || bytes < ts || le16_to_cpu(rt->total) > ne || ff > ts - sizeof(__le32) || lf > ts - sizeof(__le32) || (ff && ff < sizeof(struct RESTART_TABLE)) || (lf && lf < sizeof(struct RESTART_TABLE))) { return false; } /* * Verify each entry is either allocated or points * to a valid offset the table. */ for (i = 0; i < ne; i++) { off = le32_to_cpu(*(__le32 *)Add2Ptr( rt, i * rsize + sizeof(struct RESTART_TABLE))); if (off != RESTART_ENTRY_ALLOCATED && off && (off < sizeof(struct RESTART_TABLE) || ((off - sizeof(struct RESTART_TABLE)) % rsize))) { return false; } } /* * Walk through the list headed by the first entry to make * sure none of the entries are currently being used. */ for (off = ff; off;) { if (off == RESTART_ENTRY_ALLOCATED) return false; off = le32_to_cpu(*(__le32 *)Add2Ptr(rt, off)); if (off > ts - sizeof(__le32)) return false; } return true; } /* * free_rsttbl_idx - Free a previously allocated index a Restart Table. */ static inline void free_rsttbl_idx(struct RESTART_TABLE *rt, u32 off) { __le32 *e; u32 lf = le32_to_cpu(rt->last_free); __le32 off_le = cpu_to_le32(off); e = Add2Ptr(rt, off); if (off < le32_to_cpu(rt->free_goal)) { *e = rt->first_free; rt->first_free = off_le; if (!lf) rt->last_free = off_le; } else { if (lf) *(__le32 *)Add2Ptr(rt, lf) = off_le; else rt->first_free = off_le; rt->last_free = off_le; *e = 0; } le16_sub_cpu(&rt->total, 1); } static inline struct RESTART_TABLE *init_rsttbl(u16 esize, u16 used) { __le32 *e, *last_free; u32 off; u32 bytes = esize * used + sizeof(struct RESTART_TABLE); u32 lf = sizeof(struct RESTART_TABLE) + (used - 1) * esize; struct RESTART_TABLE *t = kzalloc(bytes, GFP_NOFS); if (!t) return NULL; t->size = cpu_to_le16(esize); t->used = cpu_to_le16(used); t->free_goal = cpu_to_le32(~0u); t->first_free = cpu_to_le32(sizeof(struct RESTART_TABLE)); t->last_free = cpu_to_le32(lf); e = (__le32 *)(t + 1); last_free = Add2Ptr(t, lf); for (off = sizeof(struct RESTART_TABLE) + esize; e < last_free; e = Add2Ptr(e, esize), off += esize) { *e = cpu_to_le32(off); } return t; } static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, u32 add, u32 free_goal) { u16 esize = le16_to_cpu(tbl->size); __le32 osize = cpu_to_le32(bytes_per_rt(tbl)); u32 used = le16_to_cpu(tbl->used); struct RESTART_TABLE *rt; rt = init_rsttbl(esize, used + add); if (!rt) return NULL; memcpy(rt + 1, tbl + 1, esize * used); rt->free_goal = free_goal == ~0u ? cpu_to_le32(~0u) : cpu_to_le32(sizeof(struct RESTART_TABLE) + free_goal * esize); if (tbl->first_free) { rt->first_free = tbl->first_free; *(__le32 *)Add2Ptr(rt, le32_to_cpu(tbl->last_free)) = osize; } else { rt->first_free = osize; } rt->total = tbl->total; kfree(tbl); return rt; } /* * alloc_rsttbl_idx * * Allocate an index from within a previously initialized Restart Table. */ static inline void *alloc_rsttbl_idx(struct RESTART_TABLE **tbl) { u32 off; __le32 *e; struct RESTART_TABLE *t = *tbl; if (!t->first_free) { *tbl = t = extend_rsttbl(t, 16, ~0u); if (!t) return NULL; } off = le32_to_cpu(t->first_free); /* Dequeue this entry and zero it. */ e = Add2Ptr(t, off); t->first_free = *e; memset(e, 0, le16_to_cpu(t->size)); *e = RESTART_ENTRY_ALLOCATED_LE; /* If list is going empty, then we fix the last_free as well. */ if (!t->first_free) t->last_free = 0; le16_add_cpu(&t->total, 1); return Add2Ptr(t, off); } /* * alloc_rsttbl_from_idx * * Allocate a specific index from within a previously initialized Restart Table. */ static inline void *alloc_rsttbl_from_idx(struct RESTART_TABLE **tbl, u32 vbo) { u32 off; __le32 *e; struct RESTART_TABLE *rt = *tbl; u32 bytes = bytes_per_rt(rt); u16 esize = le16_to_cpu(rt->size); /* If the entry is not the table, we will have to extend the table. */ if (vbo >= bytes) { /* * Extend the size by computing the number of entries between * the existing size and the desired index and adding 1 to that. */ u32 bytes2idx = vbo - bytes; /* * There should always be an integral number of entries * being added. Now extend the table. */ *tbl = rt = extend_rsttbl(rt, bytes2idx / esize + 1, bytes); if (!rt) return NULL; } /* See if the entry is already allocated, and just return if it is. */ e = Add2Ptr(rt, vbo); if (*e == RESTART_ENTRY_ALLOCATED_LE) return e; /* * Walk through the table, looking for the entry we're * interested and the previous entry. */ off = le32_to_cpu(rt->first_free); e = Add2Ptr(rt, off); if (off == vbo) { /* this is a match */ rt->first_free = *e; goto skip_looking; } /* * Need to walk through the list looking for the predecessor * of our entry. */ for (;;) { /* Remember the entry just found */ u32 last_off = off; __le32 *last_e = e; /* Should never run of entries. */ /* Lookup up the next entry the list. */ off = le32_to_cpu(*last_e); e = Add2Ptr(rt, off); /* If this is our match we are done. */ if (off == vbo) { *last_e = *e; /* * If this was the last entry, we update that * table as well. */ if (le32_to_cpu(rt->last_free) == off) rt->last_free = cpu_to_le32(last_off); break; } } skip_looking: /* If the list is now empty, we fix the last_free as well. */ if (!rt->first_free) rt->last_free = 0; /* Zero this entry. */ memset(e, 0, esize); *e = RESTART_ENTRY_ALLOCATED_LE; le16_add_cpu(&rt->total, 1); return e; } struct restart_info { u64 last_lsn; struct RESTART_HDR *r_page; u32 vbo; bool chkdsk_was_run; bool valid_page; bool initialized; bool restart; }; #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) #define NTFSLOG_WRAPPED 0x00000001 #define NTFSLOG_MULTIPLE_PAGE_IO 0x00000002 #define NTFSLOG_NO_LAST_LSN 0x00000004 #define NTFSLOG_REUSE_TAIL 0x00000010 #define NTFSLOG_NO_OLDEST_LSN 0x00000020 /* Helper struct to work with NTFS $LogFile. */ struct ntfs_log { struct ntfs_inode *ni; u32 l_size; u32 orig_file_size; u32 sys_page_size; u32 sys_page_mask; u32 page_size; u32 page_mask; // page_size - 1 u8 page_bits; struct RECORD_PAGE_HDR *one_page_buf; struct RESTART_TABLE *open_attr_tbl; u32 transaction_id; u32 clst_per_page; u32 first_page; u32 next_page; u32 ra_off; u32 data_off; u32 restart_size; u32 data_size; u16 record_header_len; u64 seq_num; u32 seq_num_bits; u32 file_data_bits; u32 seq_num_mask; /* (1 << file_data_bits) - 1 */ struct RESTART_AREA *ra; /* In-memory image of the next restart area. */ u32 ra_size; /* The usable size of the restart area. */ /* * If true, then the in-memory restart area is to be written * to the first position on the disk. */ bool init_ra; bool set_dirty; /* True if we need to set dirty flag. */ u64 oldest_lsn; u32 oldest_lsn_off; u64 last_lsn; u32 total_avail; u32 total_avail_pages; u32 total_undo_commit; u32 max_current_avail; u32 current_avail; u32 reserved; short major_ver; short minor_ver; u32 l_flags; /* See NTFSLOG_XXX */ u32 current_openlog_count; /* On-disk value for open_log_count. */ struct CLIENT_ID client_id; u32 client_undo_commit; struct restart_info rst_info, rst_info2; }; static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) { u32 vbo = (lsn << log->seq_num_bits) >> (log->seq_num_bits - 3); return vbo; } /* Compute the offset in the log file of the next log page. */ static inline u32 next_page_off(struct ntfs_log *log, u32 off) { off = (off & ~log->sys_page_mask) + log->page_size; return off >= log->l_size ? log->first_page : off; } static inline u32 lsn_to_page_off(struct ntfs_log *log, u64 lsn) { return (((u32)lsn) << 3) & log->page_mask; } static inline u64 vbo_to_lsn(struct ntfs_log *log, u32 off, u64 Seq) { return (off >> 3) + (Seq << log->file_data_bits); } static inline bool is_lsn_in_file(struct ntfs_log *log, u64 lsn) { return lsn >= log->oldest_lsn && lsn <= le64_to_cpu(log->ra->current_lsn); } static inline u32 hdr_file_off(struct ntfs_log *log, struct RECORD_PAGE_HDR *hdr) { if (log->major_ver < 2) return le64_to_cpu(hdr->rhdr.lsn); return le32_to_cpu(hdr->file_off); } static inline u64 base_lsn(struct ntfs_log *log, const struct RECORD_PAGE_HDR *hdr, u64 lsn) { u64 h_lsn = le64_to_cpu(hdr->rhdr.lsn); u64 ret = (((h_lsn >> log->file_data_bits) + (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0)) << log->file_data_bits) + ((((is_log_record_end(hdr) && h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ? le16_to_cpu(hdr->record_hdr.next_record_off) : log->page_size) + lsn) >> 3); return ret; } static inline bool verify_client_lsn(struct ntfs_log *log, const struct CLIENT_REC *client, u64 lsn) { return lsn >= le64_to_cpu(client->oldest_lsn) && lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; } static int read_log_page(struct ntfs_log *log, u32 vbo, struct RECORD_PAGE_HDR **buffer, bool *usa_error) { int err = 0; u32 page_idx = vbo >> log->page_bits; u32 page_off = vbo & log->page_mask; u32 bytes = log->page_size - page_off; void *to_free = NULL; u32 page_vbo = page_idx << log->page_bits; struct RECORD_PAGE_HDR *page_buf; struct ntfs_inode *ni = log->ni; bool bBAAD; if (vbo >= log->l_size) return -EINVAL; if (!*buffer) { to_free = kmalloc(log->page_size, GFP_NOFS); if (!to_free) return -ENOMEM; *buffer = to_free; } page_buf = page_off ? log->one_page_buf : *buffer; err = ntfs_read_run_nb(ni->mi.sbi, &ni->file.run, page_vbo, page_buf, log->page_size, NULL); if (err) goto out; if (page_buf->rhdr.sign != NTFS_FFFF_SIGNATURE) ntfs_fix_post_read(&page_buf->rhdr, PAGE_SIZE, false); if (page_buf != *buffer) memcpy(*buffer, Add2Ptr(page_buf, page_off), bytes); bBAAD = page_buf->rhdr.sign == NTFS_BAAD_SIGNATURE; if (usa_error) *usa_error = bBAAD; /* Check that the update sequence array for this page is valid */ /* If we don't allow errors, raise an error status */ else if (bBAAD) err = -EINVAL; out: if (err && to_free) { kfree(to_free); *buffer = NULL; } return err; } /* * log_read_rst * * It walks through 512 blocks of the file looking for a valid * restart page header. It will stop the first time we find a * valid page header. */ static int log_read_rst(struct ntfs_log *log, bool first, struct restart_info *info) { u32 skip; u64 vbo; struct RESTART_HDR *r_page = NULL; /* Determine which restart area we are looking for. */ if (first) { vbo = 0; skip = 512; } else { vbo = 512; skip = 0; } /* Loop continuously until we succeed. */ for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) { bool usa_error; bool brst, bchk; struct RESTART_AREA *ra; /* Read a page header at the current offset. */ if (read_log_page(log, vbo, (struct RECORD_PAGE_HDR **)&r_page, &usa_error)) { /* Ignore any errors. */ continue; } /* Exit if the signature is a log record page. */ if (r_page->rhdr.sign == NTFS_RCRD_SIGNATURE) { info->initialized = true; break; } brst = r_page->rhdr.sign == NTFS_RSTR_SIGNATURE; bchk = r_page->rhdr.sign == NTFS_CHKD_SIGNATURE; if (!bchk && !brst) { if (r_page->rhdr.sign != NTFS_FFFF_SIGNATURE) { /* * Remember if the signature does not * indicate uninitialized file. */ info->initialized = true; } continue; } ra = NULL; info->valid_page = false; info->initialized = true; info->vbo = vbo; /* Let's check the restart area if this is a valid page. */ if (!is_rst_page_hdr_valid(vbo, r_page)) goto check_result; ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); if (!is_rst_area_valid(r_page)) goto check_result; /* * We have a valid restart page header and restart area. * If chkdsk was run or we have no clients then we have * no more checking to do. */ if (bchk || ra->client_idx[1] == LFS_NO_CLIENT_LE) { info->valid_page = true; goto check_result; } if (is_client_area_valid(r_page, usa_error)) { info->valid_page = true; ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); } check_result: /* * If chkdsk was run then update the caller's * values and return. */ if (r_page->rhdr.sign == NTFS_CHKD_SIGNATURE) { info->chkdsk_was_run = true; info->last_lsn = le64_to_cpu(r_page->rhdr.lsn); info->restart = true; info->r_page = r_page; return 0; } /* * If we have a valid page then copy the values * we need from it. */ if (info->valid_page) { info->last_lsn = le64_to_cpu(ra->current_lsn); info->restart = true; info->r_page = r_page; return 0; } } kfree(r_page); return 0; } /* * Ilog_init_pg_hdr - Init @log from restart page header. */ static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver) { log->sys_page_size = log->page_size; log->sys_page_mask = log->page_mask; log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; if (!log->clst_per_page) log->clst_per_page = 1; log->first_page = major_ver >= 2 ? 0x22 * log->page_size : 4 * log->page_size; log->major_ver = major_ver; log->minor_ver = minor_ver; } /* * log_create - Init @log in cases when we don't have a restart area to use. */ static void log_create(struct ntfs_log *log, const u64 last_lsn, u32 open_log_count, bool wrapped, bool use_multi_page) { /* All file offsets must be quadword aligned. */ log->file_data_bits = blksize_bits(log->l_size) - 3; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; log->seq_num = (last_lsn >> log->file_data_bits) + 2; log->next_page = log->first_page; log->oldest_lsn = log->seq_num << log->file_data_bits; log->oldest_lsn_off = 0; log->last_lsn = log->oldest_lsn; log->l_flags |= NTFSLOG_NO_LAST_LSN | NTFSLOG_NO_OLDEST_LSN; /* Set the correct flags for the I/O and indicate if we have wrapped. */ if (wrapped) log->l_flags |= NTFSLOG_WRAPPED; if (use_multi_page) log->l_flags |= NTFSLOG_MULTIPLE_PAGE_IO; /* Compute the log page values. */ log->data_off = ALIGN( offsetof(struct RECORD_PAGE_HDR, fixups) + sizeof(short) * ((log->page_size >> SECTOR_SHIFT) + 1), 8); log->data_size = log->page_size - log->data_off; log->record_header_len = sizeof(struct LFS_RECORD_HDR); /* Remember the different page sizes for reservation. */ log->reserved = log->data_size - log->record_header_len; /* Compute the restart page values. */ log->ra_off = ALIGN( offsetof(struct RESTART_HDR, fixups) + sizeof(short) * ((log->sys_page_size >> SECTOR_SHIFT) + 1), 8); log->restart_size = log->sys_page_size - log->ra_off; log->ra_size = struct_size(log->ra, clients, 1); log->current_openlog_count = open_log_count; /* * The total available log file space is the number of * log file pages times the space available on each page. */ log->total_avail_pages = log->l_size - log->first_page; log->total_avail = log->total_avail_pages >> log->page_bits; /* * We assume that we can't use the end of the page less than * the file record size. * Then we won't need to reserve more than the caller asks for. */ log->max_current_avail = log->total_avail * log->reserved; log->total_avail = log->total_avail * log->data_size; log->current_avail = log->max_current_avail; } /* * log_create_ra - Fill a restart area from the values stored in @log. */ static struct RESTART_AREA *log_create_ra(struct ntfs_log *log) { struct CLIENT_REC *cr; struct RESTART_AREA *ra = kzalloc(log->restart_size, GFP_NOFS); if (!ra) return NULL; ra->current_lsn = cpu_to_le64(log->last_lsn); ra->log_clients = cpu_to_le16(1); ra->client_idx[1] = LFS_NO_CLIENT_LE; if (log->l_flags & NTFSLOG_MULTIPLE_PAGE_IO) ra->flags = RESTART_SINGLE_PAGE_IO; ra->seq_num_bits = cpu_to_le32(log->seq_num_bits); ra->ra_len = cpu_to_le16(log->ra_size); ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients)); ra->l_size = cpu_to_le64(log->l_size); ra->rec_hdr_len = cpu_to_le16(log->record_header_len); ra->data_off = cpu_to_le16(log->data_off); ra->open_log_count = cpu_to_le32(log->current_openlog_count + 1); cr = ra->clients; cr->prev_client = LFS_NO_CLIENT_LE; cr->next_client = LFS_NO_CLIENT_LE; return ra; } static u32 final_log_off(struct ntfs_log *log, u64 lsn, u32 data_len) { u32 base_vbo = lsn << 3; u32 final_log_off = (base_vbo & log->seq_num_mask) & ~log->page_mask; u32 page_off = base_vbo & log->page_mask; u32 tail = log->page_size - page_off; page_off -= 1; /* Add the length of the header. */ data_len += log->record_header_len; /* * If this lsn is contained this log page we are done. * Otherwise we need to walk through several log pages. */ if (data_len > tail) { data_len -= tail; tail = log->data_size; page_off = log->data_off - 1; for (;;) { final_log_off = next_page_off(log, final_log_off); /* * We are done if the remaining bytes * fit on this page. */ if (data_len <= tail) break; data_len -= tail; } } /* * We add the remaining bytes to our starting position on this page * and then add that value to the file offset of this log page. */ return final_log_off + data_len + page_off; } static int next_log_lsn(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh, u64 *lsn) { int err; u64 this_lsn = le64_to_cpu(rh->this_lsn); u32 vbo = lsn_to_vbo(log, this_lsn); u32 end = final_log_off(log, this_lsn, le32_to_cpu(rh->client_data_len)); u32 hdr_off = end & ~log->sys_page_mask; u64 seq = this_lsn >> log->file_data_bits; struct RECORD_PAGE_HDR *page = NULL; /* Remember if we wrapped. */ if (end <= vbo) seq += 1; /* Log page header for this page. */ err = read_log_page(log, hdr_off, &page, NULL); if (err) return err; /* * If the lsn we were given was not the last lsn on this page, * then the starting offset for the next lsn is on a quad word * boundary following the last file offset for the current lsn. * Otherwise the file offset is the start of the data on the next page. */ if (this_lsn == le64_to_cpu(page->rhdr.lsn)) { /* If we wrapped, we need to increment the sequence number. */ hdr_off = next_page_off(log, hdr_off); if (hdr_off == log->first_page) seq += 1; vbo = hdr_off + log->data_off; } else { vbo = ALIGN(end, 8); } /* Compute the lsn based on the file offset and the sequence count. */ *lsn = vbo_to_lsn(log, vbo, seq); /* * If this lsn is within the legal range for the file, we return true. * Otherwise false indicates that there are no more lsn's. */ if (!is_lsn_in_file(log, *lsn)) *lsn = 0; kfree(page); return 0; } /* * current_log_avail - Calculate the number of bytes available for log records. */ static u32 current_log_avail(struct ntfs_log *log) { u32 oldest_off, next_free_off, free_bytes; if (log->l_flags & NTFSLOG_NO_LAST_LSN) { /* The entire file is available. */ return log->max_current_avail; } /* * If there is a last lsn the restart area then we know that we will * have to compute the free range. * If there is no oldest lsn then start at the first page of the file. */ oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ? log->first_page : (log->oldest_lsn_off & ~log->sys_page_mask); /* * We will use the next log page offset to compute the next free page. * If we are going to reuse this page go to the next page. * If we are at the first page then use the end of the file. */ next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ? log->next_page + log->page_size : log->next_page == log->first_page ? log->l_size : log->next_page; /* If the two offsets are the same then there is no available space. */ if (oldest_off == next_free_off) return 0; /* * If the free offset follows the oldest offset then subtract * this range from the total available pages. */ free_bytes = oldest_off < next_free_off ? log->total_avail_pages - (next_free_off - oldest_off) : oldest_off - next_free_off; free_bytes >>= log->page_bits; return free_bytes * log->reserved; } static bool check_subseq_log_page(struct ntfs_log *log, const struct RECORD_PAGE_HDR *rp, u32 vbo, u64 seq) { u64 lsn_seq; const struct NTFS_RECORD_HEADER *rhdr = &rp->rhdr; u64 lsn = le64_to_cpu(rhdr->lsn); if (rhdr->sign == NTFS_FFFF_SIGNATURE || !rhdr->sign) return false; /* * If the last lsn on the page occurs was written after the page * that caused the original error then we have a fatal error. */ lsn_seq = lsn >> log->file_data_bits; /* * If the sequence number for the lsn the page is equal or greater * than lsn we expect, then this is a subsequent write. */ return lsn_seq >= seq || (lsn_seq == seq - 1 && log->first_page == vbo && vbo != (lsn_to_vbo(log, lsn) & ~log->page_mask)); } /* * last_log_lsn * * Walks through the log pages for a file, searching for the * last log page written to the file. */ static int last_log_lsn(struct ntfs_log *log) { int err; bool usa_error = false; bool replace_page = false; bool reuse_page = log->l_flags & NTFSLOG_REUSE_TAIL; bool wrapped_file, wrapped; u32 page_cnt = 1, page_pos = 1; u32 page_off = 0, page_off1 = 0, saved_off = 0; u32 final_off, second_off, final_off_prev = 0, second_off_prev = 0; u32 first_file_off = 0, second_file_off = 0; u32 part_io_count = 0; u32 tails = 0; u32 this_off, curpage_off, nextpage_off, remain_pages; u64 expected_seq, seq_base = 0, lsn_base = 0; u64 best_lsn, best_lsn1, best_lsn2; u64 lsn_cur, lsn1, lsn2; u64 last_ok_lsn = reuse_page ? log->last_lsn : 0; u16 cur_pos, best_page_pos; struct RECORD_PAGE_HDR *page = NULL; struct RECORD_PAGE_HDR *tst_page = NULL; struct RECORD_PAGE_HDR *first_tail = NULL; struct RECORD_PAGE_HDR *second_tail = NULL; struct RECORD_PAGE_HDR *tail_page = NULL; struct RECORD_PAGE_HDR *second_tail_prev = NULL; struct RECORD_PAGE_HDR *first_tail_prev = NULL; struct RECORD_PAGE_HDR *page_bufs = NULL; struct RECORD_PAGE_HDR *best_page; if (log->major_ver >= 2) { final_off = 0x02 * log->page_size; second_off = 0x12 * log->page_size; // 0x10 == 0x12 - 0x2 page_bufs = kmalloc(log->page_size * 0x10, GFP_NOFS); if (!page_bufs) return -ENOMEM; } else { second_off = log->first_page - log->page_size; final_off = second_off - log->page_size; } next_tail: /* Read second tail page (at pos 3/0x12000). */ if (read_log_page(log, second_off, &second_tail, &usa_error) || usa_error || second_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { kfree(second_tail); second_tail = NULL; second_file_off = 0; lsn2 = 0; } else { second_file_off = hdr_file_off(log, second_tail); lsn2 = le64_to_cpu(second_tail->record_hdr.last_end_lsn); } /* Read first tail page (at pos 2/0x2000). */ if (read_log_page(log, final_off, &first_tail, &usa_error) || usa_error || first_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { kfree(first_tail); first_tail = NULL; first_file_off = 0; lsn1 = 0; } else { first_file_off = hdr_file_off(log, first_tail); lsn1 = le64_to_cpu(first_tail->record_hdr.last_end_lsn); } if (log->major_ver < 2) { int best_page; first_tail_prev = first_tail; final_off_prev = first_file_off; second_tail_prev = second_tail; second_off_prev = second_file_off; tails = 1; if (!first_tail && !second_tail) goto tail_read; if (first_tail && second_tail) best_page = lsn1 < lsn2 ? 1 : 0; else if (first_tail) best_page = 0; else best_page = 1; page_off = best_page ? second_file_off : first_file_off; seq_base = (best_page ? lsn2 : lsn1) >> log->file_data_bits; goto tail_read; } best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) : 0; if (first_tail && second_tail) { if (best_lsn1 > best_lsn2) { best_lsn = best_lsn1; best_page = first_tail; this_off = first_file_off; } else { best_lsn = best_lsn2; best_page = second_tail; this_off = second_file_off; } } else if (first_tail) { best_lsn = best_lsn1; best_page = first_tail; this_off = first_file_off; } else if (second_tail) { best_lsn = best_lsn2; best_page = second_tail; this_off = second_file_off; } else { goto tail_read; } best_page_pos = le16_to_cpu(best_page->page_pos); if (!tails) { if (best_page_pos == page_pos) { seq_base = best_lsn >> log->file_data_bits; saved_off = page_off = le32_to_cpu(best_page->file_off); lsn_base = best_lsn; memmove(page_bufs, best_page, log->page_size); page_cnt = le16_to_cpu(best_page->page_count); if (page_cnt > 1) page_pos += 1; tails = 1; } } else if (seq_base == (best_lsn >> log->file_data_bits) && saved_off + log->page_size == this_off && lsn_base < best_lsn && (page_pos != page_cnt || best_page_pos == page_pos || best_page_pos == 1) && (page_pos >= page_cnt || best_page_pos == page_pos)) { u16 bppc = le16_to_cpu(best_page->page_count); saved_off += log->page_size; lsn_base = best_lsn; memmove(Add2Ptr(page_bufs, tails * log->page_size), best_page, log->page_size); tails += 1; if (best_page_pos != bppc) { page_cnt = bppc; page_pos = best_page_pos; if (page_cnt > 1) page_pos += 1; } else { page_pos = page_cnt = 1; } } else { kfree(first_tail); kfree(second_tail); goto tail_read; } kfree(first_tail_prev); first_tail_prev = first_tail; final_off_prev = first_file_off; first_tail = NULL; kfree(second_tail_prev); second_tail_prev = second_tail; second_off_prev = second_file_off; second_tail = NULL; final_off += log->page_size; second_off += log->page_size; if (tails < 0x10) goto next_tail; tail_read: first_tail = first_tail_prev; final_off = final_off_prev; second_tail = second_tail_prev; second_off = second_off_prev; page_cnt = page_pos = 1; curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) : log->next_page; wrapped_file = curpage_off == log->first_page && !(log->l_flags & (NTFSLOG_NO_LAST_LSN | NTFSLOG_REUSE_TAIL)); expected_seq = wrapped_file ? (log->seq_num + 1) : log->seq_num; nextpage_off = curpage_off; next_page: tail_page = NULL; /* Read the next log page. */ err = read_log_page(log, curpage_off, &page, &usa_error); /* Compute the next log page offset the file. */ nextpage_off = next_page_off(log, curpage_off); wrapped = nextpage_off == log->first_page; if (tails > 1) { struct RECORD_PAGE_HDR *cur_page = Add2Ptr(page_bufs, curpage_off - page_off); if (curpage_off == saved_off) { tail_page = cur_page; goto use_tail_page; } if (page_off > curpage_off || curpage_off >= saved_off) goto use_tail_page; if (page_off1) goto use_cur_page; if (!err && !usa_error && page->rhdr.sign == NTFS_RCRD_SIGNATURE && cur_page->rhdr.lsn == page->rhdr.lsn && cur_page->record_hdr.next_record_off == page->record_hdr.next_record_off && ((page_pos == page_cnt && le16_to_cpu(page->page_pos) == 1) || (page_pos != page_cnt && le16_to_cpu(page->page_pos) == page_pos + 1 && le16_to_cpu(page->page_count) == page_cnt))) { cur_page = NULL; goto use_tail_page; } page_off1 = page_off; use_cur_page: lsn_cur = le64_to_cpu(cur_page->rhdr.lsn); if (last_ok_lsn != le64_to_cpu(cur_page->record_hdr.last_end_lsn) && ((lsn_cur >> log->file_data_bits) + ((curpage_off < (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ? 1 : 0)) != expected_seq) { goto check_tail; } if (!is_log_record_end(cur_page)) { tail_page = NULL; last_ok_lsn = lsn_cur; goto next_page_1; } log->seq_num = expected_seq; log->l_flags &= ~NTFSLOG_NO_LAST_LSN; log->last_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); log->ra->current_lsn = cur_page->record_hdr.last_end_lsn; if (log->record_header_len <= log->page_size - le16_to_cpu(cur_page->record_hdr.next_record_off)) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = curpage_off; } else { log->l_flags &= ~NTFSLOG_REUSE_TAIL; log->next_page = nextpage_off; } if (wrapped_file) log->l_flags |= NTFSLOG_WRAPPED; last_ok_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); goto next_page_1; } /* * If we are at the expected first page of a transfer check to see * if either tail copy is at this offset. * If this page is the last page of a transfer, check if we wrote * a subsequent tail copy. */ if (page_cnt == page_pos || page_cnt == page_pos + 1) { /* * Check if the offset matches either the first or second * tail copy. It is possible it will match both. */ if (curpage_off == final_off) tail_page = first_tail; /* * If we already matched on the first page then * check the ending lsn's. */ if (curpage_off == second_off) { if (!tail_page || (second_tail && le64_to_cpu(second_tail->record_hdr.last_end_lsn) > le64_to_cpu(first_tail->record_hdr .last_end_lsn))) { tail_page = second_tail; } } } use_tail_page: if (tail_page) { /* We have a candidate for a tail copy. */ lsn_cur = le64_to_cpu(tail_page->record_hdr.last_end_lsn); if (last_ok_lsn < lsn_cur) { /* * If the sequence number is not expected, * then don't use the tail copy. */ if (expected_seq != (lsn_cur >> log->file_data_bits)) tail_page = NULL; } else if (last_ok_lsn > lsn_cur) { /* * If the last lsn is greater than the one on * this page then forget this tail. */ tail_page = NULL; } } /* *If we have an error on the current page, * we will break of this loop. */ if (err || usa_error) goto check_tail; /* * Done if the last lsn on this page doesn't match the previous known * last lsn or the sequence number is not expected. */ lsn_cur = le64_to_cpu(page->rhdr.lsn); if (last_ok_lsn != lsn_cur && expected_seq != (lsn_cur >> log->file_data_bits)) { goto check_tail; } /* * Check that the page position and page count values are correct. * If this is the first page of a transfer the position must be 1 * and the count will be unknown. */ if (page_cnt == page_pos) { if (page->page_pos != cpu_to_le16(1) && (!reuse_page || page->page_pos != page->page_count)) { /* * If the current page is the first page we are * looking at and we are reusing this page then * it can be either the first or last page of a * transfer. Otherwise it can only be the first. */ goto check_tail; } } else if (le16_to_cpu(page->page_count) != page_cnt || le16_to_cpu(page->page_pos) != page_pos + 1) { /* * The page position better be 1 more than the last page * position and the page count better match. */ goto check_tail; } /* * We have a valid page the file and may have a valid page * the tail copy area. * If the tail page was written after the page the file then * break of the loop. */ if (tail_page && le64_to_cpu(tail_page->record_hdr.last_end_lsn) > lsn_cur) { /* Remember if we will replace the page. */ replace_page = true; goto check_tail; } tail_page = NULL; if (is_log_record_end(page)) { /* * Since we have read this page we know the sequence number * is the same as our expected value. */ log->seq_num = expected_seq; log->last_lsn = le64_to_cpu(page->record_hdr.last_end_lsn); log->ra->current_lsn = page->record_hdr.last_end_lsn; log->l_flags &= ~NTFSLOG_NO_LAST_LSN; /* * If there is room on this page for another header then * remember we want to reuse the page. */ if (log->record_header_len <= log->page_size - le16_to_cpu(page->record_hdr.next_record_off)) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = curpage_off; } else { log->l_flags &= ~NTFSLOG_REUSE_TAIL; log->next_page = nextpage_off; } /* Remember if we wrapped the log file. */ if (wrapped_file) log->l_flags |= NTFSLOG_WRAPPED; } /* * Remember the last page count and position. * Also remember the last known lsn. */ page_cnt = le16_to_cpu(page->page_count); page_pos = le16_to_cpu(page->page_pos); last_ok_lsn = le64_to_cpu(page->rhdr.lsn); next_page_1: if (wrapped) { expected_seq += 1; wrapped_file = 1; } curpage_off = nextpage_off; kfree(page); page = NULL; reuse_page = 0; goto next_page; check_tail: if (tail_page) { log->seq_num = expected_seq; log->last_lsn = le64_to_cpu(tail_page->record_hdr.last_end_lsn); log->ra->current_lsn = tail_page->record_hdr.last_end_lsn; log->l_flags &= ~NTFSLOG_NO_LAST_LSN; if (log->page_size - le16_to_cpu( tail_page->record_hdr.next_record_off) >= log->record_header_len) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = curpage_off; } else { log->l_flags &= ~NTFSLOG_REUSE_TAIL; log->next_page = nextpage_off; } if (wrapped) log->l_flags |= NTFSLOG_WRAPPED; } /* Remember that the partial IO will start at the next page. */ second_off = nextpage_off; /* * If the next page is the first page of the file then update * the sequence number for log records which begon the next page. */ if (wrapped) expected_seq += 1; /* * If we have a tail copy or are performing single page I/O we can * immediately look at the next page. */ if (replace_page || (log->ra->flags & RESTART_SINGLE_PAGE_IO)) { page_cnt = 2; page_pos = 1; goto check_valid; } if (page_pos != page_cnt) goto check_valid; /* * If the next page causes us to wrap to the beginning of the log * file then we know which page to check next. */ if (wrapped) { page_cnt = 2; page_pos = 1; goto check_valid; } cur_pos = 2; next_test_page: kfree(tst_page); tst_page = NULL; /* Walk through the file, reading log pages. */ err = read_log_page(log, nextpage_off, &tst_page, &usa_error); /* * If we get a USA error then assume that we correctly found * the end of the original transfer. */ if (usa_error) goto file_is_valid; /* * If we were able to read the page, we examine it to see if it * is the same or different Io block. */ if (err) goto next_test_page_1; if (le16_to_cpu(tst_page->page_pos) == cur_pos && check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { page_cnt = le16_to_cpu(tst_page->page_count) + 1; page_pos = le16_to_cpu(tst_page->page_pos); goto check_valid; } else { goto file_is_valid; } next_test_page_1: nextpage_off = next_page_off(log, curpage_off); wrapped = nextpage_off == log->first_page; if (wrapped) { expected_seq += 1; page_cnt = 2; page_pos = 1; } cur_pos += 1; part_io_count += 1; if (!wrapped) goto next_test_page; check_valid: /* Skip over the remaining pages this transfer. */ remain_pages = page_cnt - page_pos - 1; part_io_count += remain_pages; while (remain_pages--) { nextpage_off = next_page_off(log, curpage_off); wrapped = nextpage_off == log->first_page; if (wrapped) expected_seq += 1; } /* Call our routine to check this log page. */ kfree(tst_page); tst_page = NULL; err = read_log_page(log, nextpage_off, &tst_page, &usa_error); if (!err && !usa_error && check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { err = -EINVAL; goto out; } file_is_valid: /* We have a valid file. */ if (page_off1 || tail_page) { struct RECORD_PAGE_HDR *tmp_page; if (sb_rdonly(log->ni->mi.sbi->sb)) { err = -EROFS; goto out; } if (page_off1) { tmp_page = Add2Ptr(page_bufs, page_off1 - page_off); tails -= (page_off1 - page_off) / log->page_size; if (!tail_page) tails -= 1; } else { tmp_page = tail_page; tails = 1; } while (tails--) { u64 off = hdr_file_off(log, tmp_page); if (!page) { page = kmalloc(log->page_size, GFP_NOFS); if (!page) { err = -ENOMEM; goto out; } } /* * Correct page and copy the data from this page * into it and flush it to disk. */ memcpy(page, tmp_page, log->page_size); /* Fill last flushed lsn value flush the page. */ if (log->major_ver < 2) page->rhdr.lsn = page->record_hdr.last_end_lsn; else page->file_off = 0; page->page_pos = page->page_count = cpu_to_le16(1); ntfs_fix_pre_write(&page->rhdr, log->page_size); err = ntfs_sb_write_run(log->ni->mi.sbi, &log->ni->file.run, off, page, log->page_size, 0); if (err) goto out; if (part_io_count && second_off == off) { second_off += log->page_size; part_io_count -= 1; } tmp_page = Add2Ptr(tmp_page, log->page_size); } } if (part_io_count) { if (sb_rdonly(log->ni->mi.sbi->sb)) { err = -EROFS; goto out; } } out: kfree(second_tail); kfree(first_tail); kfree(page); kfree(tst_page); kfree(page_bufs); return err; } /* * read_log_rec_buf - Copy a log record from the file to a buffer. * * The log record may span several log pages and may even wrap the file. */ static int read_log_rec_buf(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh, void *buffer) { int err; struct RECORD_PAGE_HDR *ph = NULL; u64 lsn = le64_to_cpu(rh->this_lsn); u32 vbo = lsn_to_vbo(log, lsn) & ~log->page_mask; u32 off = lsn_to_page_off(log, lsn) + log->record_header_len; u32 data_len = le32_to_cpu(rh->client_data_len); /* * While there are more bytes to transfer, * we continue to attempt to perform the read. */ for (;;) { bool usa_error; u32 tail = log->page_size - off; if (tail >= data_len) tail = data_len; data_len -= tail; err = read_log_page(log, vbo, &ph, &usa_error); if (err) goto out; /* * The last lsn on this page better be greater or equal * to the lsn we are copying. */ if (lsn > le64_to_cpu(ph->rhdr.lsn)) { err = -EINVAL; goto out; } memcpy(buffer, Add2Ptr(ph, off), tail); /* If there are no more bytes to transfer, we exit the loop. */ if (!data_len) { if (!is_log_record_end(ph) || lsn > le64_to_cpu(ph->record_hdr.last_end_lsn)) { err = -EINVAL; goto out; } break; } if (ph->rhdr.lsn == ph->record_hdr.last_end_lsn || lsn > le64_to_cpu(ph->rhdr.lsn)) { err = -EINVAL; goto out; } vbo = next_page_off(log, vbo); off = log->data_off; /* * Adjust our pointer the user's buffer to transfer * the next block to. */ buffer = Add2Ptr(buffer, tail); } out: kfree(ph); return err; } static int read_rst_area(struct ntfs_log *log, struct NTFS_RESTART **rst_, u64 *lsn) { int err; struct LFS_RECORD_HDR *rh = NULL; const struct CLIENT_REC *cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); u64 lsnr, lsnc = le64_to_cpu(cr->restart_lsn); u32 len; struct NTFS_RESTART *rst; *lsn = 0; *rst_ = NULL; /* If the client doesn't have a restart area, go ahead and exit now. */ if (!lsnc) return 0; err = read_log_page(log, lsn_to_vbo(log, lsnc), (struct RECORD_PAGE_HDR **)&rh, NULL); if (err) return err; rst = NULL; lsnr = le64_to_cpu(rh->this_lsn); if (lsnc != lsnr) { /* If the lsn values don't match, then the disk is corrupt. */ err = -EINVAL; goto out; } *lsn = lsnr; len = le32_to_cpu(rh->client_data_len); if (!len) { err = 0; goto out; } if (len < sizeof(struct NTFS_RESTART)) { err = -EINVAL; goto out; } rst = kmalloc(len, GFP_NOFS); if (!rst) { err = -ENOMEM; goto out; } /* Copy the data into the 'rst' buffer. */ err = read_log_rec_buf(log, rh, rst); if (err) goto out; *rst_ = rst; rst = NULL; out: kfree(rh); kfree(rst); return err; } static int find_log_rec(struct ntfs_log *log, u64 lsn, struct lcb *lcb) { int err; struct LFS_RECORD_HDR *rh = lcb->lrh; u32 rec_len, len; /* Read the record header for this lsn. */ if (!rh) { err = read_log_page(log, lsn_to_vbo(log, lsn), (struct RECORD_PAGE_HDR **)&rh, NULL); lcb->lrh = rh; if (err) return err; } /* * If the lsn the log record doesn't match the desired * lsn then the disk is corrupt. */ if (lsn != le64_to_cpu(rh->this_lsn)) return -EINVAL; len = le32_to_cpu(rh->client_data_len); /* * Check that the length field isn't greater than the total * available space the log file. */ rec_len = len + log->record_header_len; if (rec_len >= log->total_avail) return -EINVAL; /* * If the entire log record is on this log page, * put a pointer to the log record the context block. */ if (rh->flags & LOG_RECORD_MULTI_PAGE) { void *lr = kmalloc(len, GFP_NOFS); if (!lr) return -ENOMEM; lcb->log_rec = lr; lcb->alloc = true; /* Copy the data into the buffer returned. */ err = read_log_rec_buf(log, rh, lr); if (err) return err; } else { /* If beyond the end of the current page -> an error. */ u32 page_off = lsn_to_page_off(log, lsn); if (page_off + len + log->record_header_len > log->page_size) return -EINVAL; lcb->log_rec = Add2Ptr(rh, sizeof(struct LFS_RECORD_HDR)); lcb->alloc = false; } return 0; } /* * read_log_rec_lcb - Init the query operation. */ static int read_log_rec_lcb(struct ntfs_log *log, u64 lsn, u32 ctx_mode, struct lcb **lcb_) { int err; const struct CLIENT_REC *cr; struct lcb *lcb; switch (ctx_mode) { case lcb_ctx_undo_next: case lcb_ctx_prev: case lcb_ctx_next: break; default: return -EINVAL; } /* Check that the given lsn is the legal range for this client. */ cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); if (!verify_client_lsn(log, cr, lsn)) return -EINVAL; lcb = kzalloc(sizeof(struct lcb), GFP_NOFS); if (!lcb) return -ENOMEM; lcb->client = log->client_id; lcb->ctx_mode = ctx_mode; /* Find the log record indicated by the given lsn. */ err = find_log_rec(log, lsn, lcb); if (err) goto out; *lcb_ = lcb; return 0; out: lcb_put(lcb); *lcb_ = NULL; return err; } /* * find_client_next_lsn * * Attempt to find the next lsn to return to a client based on the context mode. */ static int find_client_next_lsn(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) { int err; u64 next_lsn; struct LFS_RECORD_HDR *hdr; hdr = lcb->lrh; *lsn = 0; if (lcb_ctx_next != lcb->ctx_mode) goto check_undo_next; /* Loop as long as another lsn can be found. */ for (;;) { u64 current_lsn; err = next_log_lsn(log, hdr, ¤t_lsn); if (err) goto out; if (!current_lsn) break; if (hdr != lcb->lrh) kfree(hdr); hdr = NULL; err = read_log_page(log, lsn_to_vbo(log, current_lsn), (struct RECORD_PAGE_HDR **)&hdr, NULL); if (err) goto out; if (memcmp(&hdr->client, &lcb->client, sizeof(struct CLIENT_ID))) { /*err = -EINVAL; */ } else if (LfsClientRecord == hdr->record_type) { kfree(lcb->lrh); lcb->lrh = hdr; *lsn = current_lsn; return 0; } } out: if (hdr != lcb->lrh) kfree(hdr); return err; check_undo_next: if (lcb_ctx_undo_next == lcb->ctx_mode) next_lsn = le64_to_cpu(hdr->client_undo_next_lsn); else if (lcb_ctx_prev == lcb->ctx_mode) next_lsn = le64_to_cpu(hdr->client_prev_lsn); else return 0; if (!next_lsn) return 0; if (!verify_client_lsn( log, Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)), next_lsn)) return 0; hdr = NULL; err = read_log_page(log, lsn_to_vbo(log, next_lsn), (struct RECORD_PAGE_HDR **)&hdr, NULL); if (err) return err; kfree(lcb->lrh); lcb->lrh = hdr; *lsn = next_lsn; return 0; } static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) { int err; err = find_client_next_lsn(log, lcb, lsn); if (err) return err; if (!*lsn) return 0; if (lcb->alloc) kfree(lcb->log_rec); lcb->log_rec = NULL; lcb->alloc = false; kfree(lcb->lrh); lcb->lrh = NULL; return find_log_rec(log, *lsn, lcb); } bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) { __le16 mask; u32 min_de, de_off, used, total; const struct NTFS_DE *e; if (hdr_has_subnode(hdr)) { min_de = sizeof(struct NTFS_DE) + sizeof(u64); mask = NTFS_IE_HAS_SUBNODES; } else { min_de = sizeof(struct NTFS_DE); mask = 0; } de_off = le32_to_cpu(hdr->de_off); used = le32_to_cpu(hdr->used); total = le32_to_cpu(hdr->total); if (de_off > bytes - min_de || used > bytes || total > bytes || de_off + min_de > used || used > total) { return false; } e = Add2Ptr(hdr, de_off); for (;;) { u16 esize = le16_to_cpu(e->size); struct NTFS_DE *next = Add2Ptr(e, esize); if (esize < min_de || PtrOffset(hdr, next) > used || (e->flags & NTFS_IE_HAS_SUBNODES) != mask) { return false; } if (de_is_last(e)) break; e = next; } return true; } static inline bool check_index_buffer(const struct INDEX_BUFFER *ib, u32 bytes) { u16 fo; const struct NTFS_RECORD_HEADER *r = &ib->rhdr; if (r->sign != NTFS_INDX_SIGNATURE) return false; fo = (SECTOR_SIZE - ((bytes >> SECTOR_SHIFT) + 1) * sizeof(short)); if (le16_to_cpu(r->fix_off) > fo) return false; if ((le16_to_cpu(r->fix_num) - 1) * SECTOR_SIZE != bytes) return false; return check_index_header(&ib->ihdr, bytes - offsetof(struct INDEX_BUFFER, ihdr)); } static inline bool check_index_root(const struct ATTRIB *attr, struct ntfs_sb_info *sbi) { bool ret; const struct INDEX_ROOT *root = resident_data(attr); u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size ? sbi->cluster_bits : SECTOR_SHIFT; u8 block_clst = root->index_block_clst; if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) || (root->type != ATTR_NAME && root->type != ATTR_ZERO) || (root->type == ATTR_NAME && root->rule != NTFS_COLLATION_TYPE_FILENAME) || (le32_to_cpu(root->index_block_size) != (block_clst << index_bits)) || (block_clst != 1 && block_clst != 2 && block_clst != 4 && block_clst != 8 && block_clst != 0x10 && block_clst != 0x20 && block_clst != 0x40 && block_clst != 0x80)) { return false; } ret = check_index_header(&root->ihdr, le32_to_cpu(attr->res.data_size) - offsetof(struct INDEX_ROOT, ihdr)); return ret; } static inline bool check_attr(const struct MFT_REC *rec, const struct ATTRIB *attr, struct ntfs_sb_info *sbi) { u32 asize = le32_to_cpu(attr->size); u32 rsize = 0; u64 dsize, svcn, evcn; u16 run_off; /* Check the fixed part of the attribute record header. */ if (asize >= sbi->record_size || asize + PtrOffset(rec, attr) >= sbi->record_size || (attr->name_len && le16_to_cpu(attr->name_off) + attr->name_len * sizeof(short) > asize)) { return false; } /* Check the attribute fields. */ switch (attr->non_res) { case 0: rsize = le32_to_cpu(attr->res.data_size); if (rsize >= asize || le16_to_cpu(attr->res.data_off) + rsize > asize) { return false; } break; case 1: dsize = le64_to_cpu(attr->nres.data_size); svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); run_off = le16_to_cpu(attr->nres.run_off); if (svcn > evcn + 1 || run_off >= asize || le64_to_cpu(attr->nres.valid_size) > dsize || dsize > le64_to_cpu(attr->nres.alloc_size)) { return false; } if (run_off > asize) return false; if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn, Add2Ptr(attr, run_off), asize - run_off) < 0) { return false; } return true; default: return false; } switch (attr->type) { case ATTR_NAME: if (fname_full_size(Add2Ptr( attr, le16_to_cpu(attr->res.data_off))) > asize) { return false; } break; case ATTR_ROOT: return check_index_root(attr, sbi); case ATTR_STD: if (rsize < sizeof(struct ATTR_STD_INFO5) && rsize != sizeof(struct ATTR_STD_INFO)) { return false; } break; case ATTR_LIST: case ATTR_ID: case ATTR_SECURE: case ATTR_LABEL: case ATTR_VOL_INFO: case ATTR_DATA: case ATTR_ALLOC: case ATTR_BITMAP: case ATTR_REPARSE: case ATTR_EA_INFO: case ATTR_EA: case ATTR_PROPERTYSET: case ATTR_LOGGED_UTILITY_STREAM: break; default: return false; } return true; } static inline bool check_file_record(const struct MFT_REC *rec, const struct MFT_REC *rec2, struct ntfs_sb_info *sbi) { const struct ATTRIB *attr; u16 fo = le16_to_cpu(rec->rhdr.fix_off); u16 fn = le16_to_cpu(rec->rhdr.fix_num); u16 ao = le16_to_cpu(rec->attr_off); u32 rs = sbi->record_size; /* Check the file record header for consistency. */ if (rec->rhdr.sign != NTFS_FILE_SIGNATURE || fo > (SECTOR_SIZE - ((rs >> SECTOR_SHIFT) + 1) * sizeof(short)) || (fn - 1) * SECTOR_SIZE != rs || ao < MFTRECORD_FIXUP_OFFSET_1 || ao > sbi->record_size - SIZEOF_RESIDENT || !is_rec_inuse(rec) || le32_to_cpu(rec->total) != rs) { return false; } /* Loop to check all of the attributes. */ for (attr = Add2Ptr(rec, ao); attr->type != ATTR_END; attr = Add2Ptr(attr, le32_to_cpu(attr->size))) { if (check_attr(rec, attr, sbi)) continue; return false; } return true; } static inline int check_lsn(const struct NTFS_RECORD_HEADER *hdr, const u64 *rlsn) { u64 lsn; if (!rlsn) return true; lsn = le64_to_cpu(hdr->lsn); if (hdr->sign == NTFS_HOLE_SIGNATURE) return false; if (*rlsn > lsn) return true; return false; } static inline bool check_if_attr(const struct MFT_REC *rec, const struct LOG_REC_HDR *lrh) { u16 ro = le16_to_cpu(lrh->record_off); u16 o = le16_to_cpu(rec->attr_off); const struct ATTRIB *attr = Add2Ptr(rec, o); while (o < ro) { u32 asize; if (attr->type == ATTR_END) break; asize = le32_to_cpu(attr->size); if (!asize) break; o += asize; attr = Add2Ptr(attr, asize); } return o == ro; } static inline bool check_if_index_root(const struct MFT_REC *rec, const struct LOG_REC_HDR *lrh) { u16 ro = le16_to_cpu(lrh->record_off); u16 o = le16_to_cpu(rec->attr_off); const struct ATTRIB *attr = Add2Ptr(rec, o); while (o < ro) { u32 asize; if (attr->type == ATTR_END) break; asize = le32_to_cpu(attr->size); if (!asize) break; o += asize; attr = Add2Ptr(attr, asize); } return o == ro && attr->type == ATTR_ROOT; } static inline bool check_if_root_index(const struct ATTRIB *attr, const struct INDEX_HDR *hdr, const struct LOG_REC_HDR *lrh) { u16 ao = le16_to_cpu(lrh->attr_off); u32 de_off = le32_to_cpu(hdr->de_off); u32 o = PtrOffset(attr, hdr) + de_off; const struct NTFS_DE *e = Add2Ptr(hdr, de_off); u32 asize = le32_to_cpu(attr->size); while (o < ao) { u16 esize; if (o >= asize) break; esize = le16_to_cpu(e->size); if (!esize) break; o += esize; e = Add2Ptr(e, esize); } return o == ao; } static inline bool check_if_alloc_index(const struct INDEX_HDR *hdr, u32 attr_off) { u32 de_off = le32_to_cpu(hdr->de_off); u32 o = offsetof(struct INDEX_BUFFER, ihdr) + de_off; const struct NTFS_DE *e = Add2Ptr(hdr, de_off); u32 used = le32_to_cpu(hdr->used); while (o < attr_off) { u16 esize; if (de_off >= used) break; esize = le16_to_cpu(e->size); if (!esize) break; o += esize; de_off += esize; e = Add2Ptr(e, esize); } return o == attr_off; } static inline void change_attr_size(struct MFT_REC *rec, struct ATTRIB *attr, u32 nsize) { u32 asize = le32_to_cpu(attr->size); int dsize = nsize - asize; u8 *next = Add2Ptr(attr, asize); u32 used = le32_to_cpu(rec->used); memmove(Add2Ptr(attr, nsize), next, used - PtrOffset(rec, next)); rec->used = cpu_to_le32(used + dsize); attr->size = cpu_to_le32(nsize); } struct OpenAttr { struct ATTRIB *attr; struct runs_tree *run1; struct runs_tree run0; struct ntfs_inode *ni; // CLST rno; }; /* * cmp_type_and_name * * Return: 0 if 'attr' has the same type and name. */ static inline int cmp_type_and_name(const struct ATTRIB *a1, const struct ATTRIB *a2) { return a1->type != a2->type || a1->name_len != a2->name_len || (a1->name_len && memcmp(attr_name(a1), attr_name(a2), a1->name_len * sizeof(short))); } static struct OpenAttr *find_loaded_attr(struct ntfs_log *log, const struct ATTRIB *attr, CLST rno) { struct OPEN_ATTR_ENRTY *oe = NULL; while ((oe = enum_rstbl(log->open_attr_tbl, oe))) { struct OpenAttr *op_attr; if (ino_get(&oe->ref) != rno) continue; op_attr = (struct OpenAttr *)oe->ptr; if (!cmp_type_and_name(op_attr->attr, attr)) return op_attr; } return NULL; } static struct ATTRIB *attr_create_nonres_log(struct ntfs_sb_info *sbi, enum ATTR_TYPE type, u64 size, const u16 *name, size_t name_len, __le16 flags) { struct ATTRIB *attr; u32 name_size = ALIGN(name_len * sizeof(short), 8); bool is_ext = flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED); u32 asize = name_size + (is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT); attr = kzalloc(asize, GFP_NOFS); if (!attr) return NULL; attr->type = type; attr->size = cpu_to_le32(asize); attr->flags = flags; attr->non_res = 1; attr->name_len = name_len; attr->nres.evcn = cpu_to_le64((u64)bytes_to_cluster(sbi, size) - 1); attr->nres.alloc_size = cpu_to_le64(ntfs_up_cluster(sbi, size)); attr->nres.data_size = cpu_to_le64(size); attr->nres.valid_size = attr->nres.data_size; if (is_ext) { attr->name_off = SIZEOF_NONRESIDENT_EX_LE; if (is_attr_compressed(attr)) attr->nres.c_unit = NTFS_LZNT_CUNIT; attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT_EX + name_size); memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT_EX), name, name_len * sizeof(short)); } else { attr->name_off = SIZEOF_NONRESIDENT_LE; attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT + name_size); memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT), name, name_len * sizeof(short)); } return attr; } /* * do_action - Common routine for the Redo and Undo Passes. * @rlsn: If it is NULL then undo. */ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, const struct LOG_REC_HDR *lrh, u32 op, void *data, u32 dlen, u32 rec_len, const u64 *rlsn) { int err = 0; struct ntfs_sb_info *sbi = log->ni->mi.sbi; struct inode *inode = NULL, *inode_parent; struct mft_inode *mi = NULL, *mi2_child = NULL; CLST rno = 0, rno_base = 0; struct INDEX_BUFFER *ib = NULL; struct MFT_REC *rec = NULL; struct ATTRIB *attr = NULL, *attr2; struct INDEX_HDR *hdr; struct INDEX_ROOT *root; struct NTFS_DE *e, *e1, *e2; struct NEW_ATTRIBUTE_SIZES *new_sz; struct ATTR_FILE_NAME *fname; struct OpenAttr *oa, *oa2; u32 nsize, t32, asize, used, esize, off, bits; u16 id, id2; u32 record_size = sbi->record_size; u64 t64; u16 roff = le16_to_cpu(lrh->record_off); u16 aoff = le16_to_cpu(lrh->attr_off); u64 lco = 0; u64 cbo = (u64)le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; u64 tvo = le64_to_cpu(lrh->target_vcn) << sbi->cluster_bits; u64 vbo = cbo + tvo; void *buffer_le = NULL; u32 bytes = 0; bool a_dirty = false; u16 data_off; oa = oe->ptr; /* Big switch to prepare. */ switch (op) { /* ============================================================ * Process MFT records, as described by the current log record. * ============================================================ */ case InitializeFileRecordSegment: case DeallocateFileRecordSegment: case WriteEndOfFileRecordSegment: case CreateAttribute: case DeleteAttribute: case UpdateResidentValue: case UpdateMappingPairs: case SetNewAttributeSizes: case AddIndexEntryRoot: case DeleteIndexEntryRoot: case SetIndexEntryVcnRoot: case UpdateFileNameRoot: case UpdateRecordDataRoot: case ZeroEndOfFileRecord: rno = vbo >> sbi->record_bits; inode = ilookup(sbi->sb, rno); if (inode) { mi = &ntfs_i(inode)->mi; } else { /* Read from disk. */ err = mi_get(sbi, rno, &mi); if (err && op == InitializeFileRecordSegment) { mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS); if (!mi) return -ENOMEM; err = mi_format_new(mi, sbi, rno, 0, false); } if (err) return err; } rec = mi->mrec; if (op == DeallocateFileRecordSegment) goto skip_load_parent; if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) goto dirty_vol; if (!check_lsn(&rec->rhdr, rlsn)) goto out; if (!check_file_record(rec, NULL, sbi)) goto dirty_vol; attr = Add2Ptr(rec, roff); if (is_rec_base(rec) || InitializeFileRecordSegment == op) { rno_base = rno; goto skip_load_parent; } rno_base = ino_get(&rec->parent_ref); inode_parent = ntfs_iget5(sbi->sb, &rec->parent_ref, NULL); if (IS_ERR(inode_parent)) goto skip_load_parent; if (is_bad_inode(inode_parent)) { iput(inode_parent); goto skip_load_parent; } if (ni_load_mi_ex(ntfs_i(inode_parent), rno, &mi2_child)) { iput(inode_parent); } else { if (mi2_child->mrec != mi->mrec) memcpy(mi2_child->mrec, mi->mrec, sbi->record_size); if (inode) iput(inode); else mi_put(mi); inode = inode_parent; mi = mi2_child; rec = mi2_child->mrec; attr = Add2Ptr(rec, roff); } skip_load_parent: inode_parent = NULL; break; /* * Process attributes, as described by the current log record. */ case UpdateNonresidentValue: case AddIndexEntryAllocation: case DeleteIndexEntryAllocation: case WriteEndOfIndexBuffer: case SetIndexEntryVcnAllocation: case UpdateFileNameAllocation: case SetBitsInNonresidentBitMap: case ClearBitsInNonresidentBitMap: case UpdateRecordDataAllocation: attr = oa->attr; bytes = UpdateNonresidentValue == op ? dlen : 0; lco = (u64)le16_to_cpu(lrh->lcns_follow) << sbi->cluster_bits; if (attr->type == ATTR_ALLOC) { t32 = le32_to_cpu(oe->bytes_per_index); if (bytes < t32) bytes = t32; } if (!bytes) bytes = lco - cbo; bytes += roff; if (attr->type == ATTR_ALLOC) bytes = (bytes + 511) & ~511; // align buffer_le = kmalloc(bytes, GFP_NOFS); if (!buffer_le) return -ENOMEM; err = ntfs_read_run_nb(sbi, oa->run1, vbo, buffer_le, bytes, NULL); if (err) goto out; if (attr->type == ATTR_ALLOC && *(int *)buffer_le) ntfs_fix_post_read(buffer_le, bytes, false); break; default: WARN_ON(1); } /* Big switch to do operation. */ switch (op) { case InitializeFileRecordSegment: if (roff + dlen > record_size) goto dirty_vol; memcpy(Add2Ptr(rec, roff), data, dlen); mi->dirty = true; break; case DeallocateFileRecordSegment: clear_rec_inuse(rec); le16_add_cpu(&rec->seq, 1); mi->dirty = true; break; case WriteEndOfFileRecordSegment: attr2 = (struct ATTRIB *)data; if (!check_if_attr(rec, lrh) || roff + dlen > record_size) goto dirty_vol; memmove(attr, attr2, dlen); rec->used = cpu_to_le32(ALIGN(roff + dlen, 8)); mi->dirty = true; break; case CreateAttribute: attr2 = (struct ATTRIB *)data; asize = le32_to_cpu(attr2->size); used = le32_to_cpu(rec->used); if (!check_if_attr(rec, lrh) || dlen < SIZEOF_RESIDENT || !IS_ALIGNED(asize, 8) || Add2Ptr(attr2, asize) > Add2Ptr(lrh, rec_len) || dlen > record_size - used) { goto dirty_vol; } memmove(Add2Ptr(attr, asize), attr, used - roff); memcpy(attr, attr2, asize); rec->used = cpu_to_le32(used + asize); id = le16_to_cpu(rec->next_attr_id); id2 = le16_to_cpu(attr2->id); if (id <= id2) rec->next_attr_id = cpu_to_le16(id2 + 1); if (is_attr_indexed(attr)) le16_add_cpu(&rec->hard_links, 1); oa2 = find_loaded_attr(log, attr, rno_base); if (oa2) { void *p2 = kmemdup(attr, le32_to_cpu(attr->size), GFP_NOFS); if (p2) { // run_close(oa2->run1); kfree(oa2->attr); oa2->attr = p2; } } mi->dirty = true; break; case DeleteAttribute: asize = le32_to_cpu(attr->size); used = le32_to_cpu(rec->used); if (!check_if_attr(rec, lrh)) goto dirty_vol; rec->used = cpu_to_le32(used - asize); if (is_attr_indexed(attr)) le16_add_cpu(&rec->hard_links, -1); memmove(attr, Add2Ptr(attr, asize), used - asize - roff); mi->dirty = true; break; case UpdateResidentValue: nsize = aoff + dlen; if (!check_if_attr(rec, lrh)) goto dirty_vol; asize = le32_to_cpu(attr->size); used = le32_to_cpu(rec->used); if (lrh->redo_len == lrh->undo_len) { if (nsize > asize) goto dirty_vol; goto move_data; } if (nsize > asize && nsize - asize > record_size - used) goto dirty_vol; nsize = ALIGN(nsize, 8); data_off = le16_to_cpu(attr->res.data_off); if (nsize < asize) { memmove(Add2Ptr(attr, aoff), data, dlen); data = NULL; // To skip below memmove(). } memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize), used - le16_to_cpu(lrh->record_off) - asize); rec->used = cpu_to_le32(used + nsize - asize); attr->size = cpu_to_le32(nsize); attr->res.data_size = cpu_to_le32(aoff + dlen - data_off); move_data: if (data) memmove(Add2Ptr(attr, aoff), data, dlen); oa2 = find_loaded_attr(log, attr, rno_base); if (oa2) { void *p2 = kmemdup(attr, le32_to_cpu(attr->size), GFP_NOFS); if (p2) { // run_close(&oa2->run0); oa2->run1 = &oa2->run0; kfree(oa2->attr); oa2->attr = p2; } } mi->dirty = true; break; case UpdateMappingPairs: nsize = aoff + dlen; asize = le32_to_cpu(attr->size); used = le32_to_cpu(rec->used); if (!check_if_attr(rec, lrh) || !attr->non_res || aoff < le16_to_cpu(attr->nres.run_off) || aoff > asize || (nsize > asize && nsize - asize > record_size - used)) { goto dirty_vol; } nsize = ALIGN(nsize, 8); memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize), used - le16_to_cpu(lrh->record_off) - asize); rec->used = cpu_to_le32(used + nsize - asize); attr->size = cpu_to_le32(nsize); memmove(Add2Ptr(attr, aoff), data, dlen); if (run_get_highest_vcn(le64_to_cpu(attr->nres.svcn), attr_run(attr), &t64)) { goto dirty_vol; } attr->nres.evcn = cpu_to_le64(t64); oa2 = find_loaded_attr(log, attr, rno_base); if (oa2 && oa2->attr->non_res) oa2->attr->nres.evcn = attr->nres.evcn; mi->dirty = true; break; case SetNewAttributeSizes: new_sz = data; if (!check_if_attr(rec, lrh) || !attr->non_res) goto dirty_vol; attr->nres.alloc_size = new_sz->alloc_size; attr->nres.data_size = new_sz->data_size; attr->nres.valid_size = new_sz->valid_size; if (dlen >= sizeof(struct NEW_ATTRIBUTE_SIZES)) attr->nres.total_size = new_sz->total_size; oa2 = find_loaded_attr(log, attr, rno_base); if (oa2) { void *p2 = kmemdup(attr, le32_to_cpu(attr->size), GFP_NOFS); if (p2) { kfree(oa2->attr); oa2->attr = p2; } } mi->dirty = true; break; case AddIndexEntryRoot: e = (struct NTFS_DE *)data; esize = le16_to_cpu(e->size); root = resident_data(attr); hdr = &root->ihdr; used = le32_to_cpu(hdr->used); if (!check_if_index_root(rec, lrh) || !check_if_root_index(attr, hdr, lrh) || Add2Ptr(data, esize) > Add2Ptr(lrh, rec_len) || esize > le32_to_cpu(rec->total) - le32_to_cpu(rec->used)) { goto dirty_vol; } e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); change_attr_size(rec, attr, le32_to_cpu(attr->size) + esize); memmove(Add2Ptr(e1, esize), e1, PtrOffset(e1, Add2Ptr(hdr, used))); memmove(e1, e, esize); le32_add_cpu(&attr->res.data_size, esize); hdr->used = cpu_to_le32(used + esize); le32_add_cpu(&hdr->total, esize); mi->dirty = true; break; case DeleteIndexEntryRoot: root = resident_data(attr); hdr = &root->ihdr; used = le32_to_cpu(hdr->used); if (!check_if_index_root(rec, lrh) || !check_if_root_index(attr, hdr, lrh)) { goto dirty_vol; } e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); esize = le16_to_cpu(e1->size); e2 = Add2Ptr(e1, esize); memmove(e1, e2, PtrOffset(e2, Add2Ptr(hdr, used))); le32_sub_cpu(&attr->res.data_size, esize); hdr->used = cpu_to_le32(used - esize); le32_sub_cpu(&hdr->total, esize); change_attr_size(rec, attr, le32_to_cpu(attr->size) - esize); mi->dirty = true; break; case SetIndexEntryVcnRoot: root = resident_data(attr); hdr = &root->ihdr; if (!check_if_index_root(rec, lrh) || !check_if_root_index(attr, hdr, lrh)) { goto dirty_vol; } e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); de_set_vbn_le(e, *(__le64 *)data); mi->dirty = true; break; case UpdateFileNameRoot: root = resident_data(attr); hdr = &root->ihdr; if (!check_if_index_root(rec, lrh) || !check_if_root_index(attr, hdr, lrh)) { goto dirty_vol; } e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); fname = (struct ATTR_FILE_NAME *)(e + 1); memmove(&fname->dup, data, sizeof(fname->dup)); // mi->dirty = true; break; case UpdateRecordDataRoot: root = resident_data(attr); hdr = &root->ihdr; if (!check_if_index_root(rec, lrh) || !check_if_root_index(attr, hdr, lrh)) { goto dirty_vol; } e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen); mi->dirty = true; break; case ZeroEndOfFileRecord: if (roff + dlen > record_size) goto dirty_vol; memset(attr, 0, dlen); mi->dirty = true; break; case UpdateNonresidentValue: if (lco < cbo + roff + dlen) goto dirty_vol; memcpy(Add2Ptr(buffer_le, roff), data, dlen); a_dirty = true; if (attr->type == ATTR_ALLOC) ntfs_fix_pre_write(buffer_le, bytes); break; case AddIndexEntryAllocation: ib = Add2Ptr(buffer_le, roff); hdr = &ib->ihdr; e = data; esize = le16_to_cpu(e->size); e1 = Add2Ptr(ib, aoff); if (is_baad(&ib->rhdr)) goto dirty_vol; if (!check_lsn(&ib->rhdr, rlsn)) goto out; used = le32_to_cpu(hdr->used); if (!check_index_buffer(ib, bytes) || !check_if_alloc_index(hdr, aoff) || Add2Ptr(e, esize) > Add2Ptr(lrh, rec_len) || used + esize > le32_to_cpu(hdr->total)) { goto dirty_vol; } memmove(Add2Ptr(e1, esize), e1, PtrOffset(e1, Add2Ptr(hdr, used))); memcpy(e1, e, esize); hdr->used = cpu_to_le32(used + esize); a_dirty = true; ntfs_fix_pre_write(&ib->rhdr, bytes); break; case DeleteIndexEntryAllocation: ib = Add2Ptr(buffer_le, roff); hdr = &ib->ihdr; e = Add2Ptr(ib, aoff); esize = le16_to_cpu(e->size); if (is_baad(&ib->rhdr)) goto dirty_vol; if (!check_lsn(&ib->rhdr, rlsn)) goto out; if (!check_index_buffer(ib, bytes) || !check_if_alloc_index(hdr, aoff)) { goto dirty_vol; } e1 = Add2Ptr(e, esize); nsize = esize; used = le32_to_cpu(hdr->used); memmove(e, e1, PtrOffset(e1, Add2Ptr(hdr, used))); hdr->used = cpu_to_le32(used - nsize); a_dirty = true; ntfs_fix_pre_write(&ib->rhdr, bytes); break; case WriteEndOfIndexBuffer: ib = Add2Ptr(buffer_le, roff); hdr = &ib->ihdr; e = Add2Ptr(ib, aoff); if (is_baad(&ib->rhdr)) goto dirty_vol; if (!check_lsn(&ib->rhdr, rlsn)) goto out; if (!check_index_buffer(ib, bytes) || !check_if_alloc_index(hdr, aoff) || aoff + dlen > offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(hdr->total)) { goto dirty_vol; } hdr->used = cpu_to_le32(dlen + PtrOffset(hdr, e)); memmove(e, data, dlen); a_dirty = true; ntfs_fix_pre_write(&ib->rhdr, bytes); break; case SetIndexEntryVcnAllocation: ib = Add2Ptr(buffer_le, roff); hdr = &ib->ihdr; e = Add2Ptr(ib, aoff); if (is_baad(&ib->rhdr)) goto dirty_vol; if (!check_lsn(&ib->rhdr, rlsn)) goto out; if (!check_index_buffer(ib, bytes) || !check_if_alloc_index(hdr, aoff)) { goto dirty_vol; } de_set_vbn_le(e, *(__le64 *)data); a_dirty = true; ntfs_fix_pre_write(&ib->rhdr, bytes); break; case UpdateFileNameAllocation: ib = Add2Ptr(buffer_le, roff); hdr = &ib->ihdr; e = Add2Ptr(ib, aoff); if (is_baad(&ib->rhdr)) goto dirty_vol; if (!check_lsn(&ib->rhdr, rlsn)) goto out; if (!check_index_buffer(ib, bytes) || !check_if_alloc_index(hdr, aoff)) { goto dirty_vol; } fname = (struct ATTR_FILE_NAME *)(e + 1); memmove(&fname->dup, data, sizeof(fname->dup)); a_dirty = true; ntfs_fix_pre_write(&ib->rhdr, bytes); break; case SetBitsInNonresidentBitMap: off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); if (cbo + (off + 7) / 8 > lco || cbo + ((off + bits + 7) / 8) > lco) { goto dirty_vol; } ntfs_bitmap_set_le(Add2Ptr(buffer_le, roff), off, bits); a_dirty = true; break; case ClearBitsInNonresidentBitMap: off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); if (cbo + (off + 7) / 8 > lco || cbo + ((off + bits + 7) / 8) > lco) { goto dirty_vol; } ntfs_bitmap_clear_le(Add2Ptr(buffer_le, roff), off, bits); a_dirty = true; break; case UpdateRecordDataAllocation: ib = Add2Ptr(buffer_le, roff); hdr = &ib->ihdr; e = Add2Ptr(ib, aoff); if (is_baad(&ib->rhdr)) goto dirty_vol; if (!check_lsn(&ib->rhdr, rlsn)) goto out; if (!check_index_buffer(ib, bytes) || !check_if_alloc_index(hdr, aoff)) { goto dirty_vol; } memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen); a_dirty = true; ntfs_fix_pre_write(&ib->rhdr, bytes); break; default: WARN_ON(1); } if (rlsn) { __le64 t64 = cpu_to_le64(*rlsn); if (rec) rec->rhdr.lsn = t64; if (ib) ib->rhdr.lsn = t64; } if (mi && mi->dirty) { err = mi_write(mi, 0); if (err) goto out; } if (a_dirty) { attr = oa->attr; err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); if (err) goto out; } out: if (inode) iput(inode); else if (mi != mi2_child) mi_put(mi); kfree(buffer_le); return err; dirty_vol: log->set_dirty = true; goto out; } /* * log_replay - Replays log and empties it. * * This function is called during mount operation. * It replays log and empties it. * Initialized is set false if logfile contains '-1'. */ int log_replay(struct ntfs_inode *ni, bool *initialized) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ntfs_log *log; u64 rec_lsn, checkpt_lsn = 0, rlsn = 0; struct ATTR_NAME_ENTRY *attr_names = NULL; u32 attr_names_bytes = 0; u32 oatbl_bytes = 0; struct RESTART_TABLE *dptbl = NULL; struct RESTART_TABLE *trtbl = NULL; const struct RESTART_TABLE *rt; struct RESTART_TABLE *oatbl = NULL; struct inode *inode; struct OpenAttr *oa; struct ntfs_inode *ni_oe; struct ATTRIB *attr = NULL; u64 size, vcn, undo_next_lsn; CLST rno, lcn, lcn0, len0, clen; void *data; struct NTFS_RESTART *rst = NULL; struct lcb *lcb = NULL; struct OPEN_ATTR_ENRTY *oe; struct ATTR_NAME_ENTRY *ane; struct TRANSACTION_ENTRY *tr; struct DIR_PAGE_ENTRY *dp; u32 i, bytes_per_attr_entry; u32 vbo, tail, off, dlen; u32 saved_len, rec_len, transact_id; bool use_second_page; struct RESTART_AREA *ra2, *ra = NULL; struct CLIENT_REC *ca, *cr; __le16 client; struct RESTART_HDR *rh; const struct LFS_RECORD_HDR *frh; const struct LOG_REC_HDR *lrh; bool is_mapped; bool is_ro = sb_rdonly(sbi->sb); u64 t64; u16 t16; u32 t32; log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); if (!log) return -ENOMEM; log->ni = ni; log->l_size = log->orig_file_size = ni->vfs_inode.i_size; /* Get the size of page. NOTE: To replay we can use default page. */ #if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true); #else log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false); #endif if (!log->page_size) { err = -EINVAL; goto out; } log->one_page_buf = kmalloc(log->page_size, GFP_NOFS); if (!log->one_page_buf) { err = -ENOMEM; goto out; } log->page_mask = log->page_size - 1; log->page_bits = blksize_bits(log->page_size); /* Look for a restart area on the disk. */ err = log_read_rst(log, true, &log->rst_info); if (err) goto out; /* remember 'initialized' */ *initialized = log->rst_info.initialized; if (!log->rst_info.restart) { if (log->rst_info.initialized) { /* No restart area but the file is not initialized. */ err = -EINVAL; goto out; } log_init_pg_hdr(log, 1, 1); log_create(log, 0, get_random_u32(), false, false); ra = log_create_ra(log); if (!ra) { err = -ENOMEM; goto out; } log->ra = ra; log->init_ra = true; goto process_log; } /* * If the restart offset above wasn't zero then we won't * look for a second restart. */ if (log->rst_info.vbo) goto check_restart_area; err = log_read_rst(log, false, &log->rst_info2); if (err) goto out; /* Determine which restart area to use. */ if (!log->rst_info2.restart || log->rst_info2.last_lsn <= log->rst_info.last_lsn) goto use_first_page; use_second_page = true; if (log->rst_info.chkdsk_was_run && log->page_size != log->rst_info.vbo) { struct RECORD_PAGE_HDR *sp = NULL; bool usa_error; if (!read_log_page(log, log->page_size, &sp, &usa_error) && sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { use_second_page = false; } kfree(sp); } if (use_second_page) { kfree(log->rst_info.r_page); memcpy(&log->rst_info, &log->rst_info2, sizeof(struct restart_info)); log->rst_info2.r_page = NULL; } use_first_page: kfree(log->rst_info2.r_page); check_restart_area: /* * If the restart area is at offset 0, we want * to write the second restart area first. */ log->init_ra = !!log->rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ ra2 = log->rst_info.valid_page ? Add2Ptr(log->rst_info.r_page, le16_to_cpu(log->rst_info.r_page->ra_off)) : NULL; if (log->rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { bool wrapped = false; bool use_multi_page = false; u32 open_log_count; /* Do some checks based on whether we have a valid log page. */ open_log_count = log->rst_info.valid_page ? le32_to_cpu(ra2->open_log_count) : get_random_u32(); log_init_pg_hdr(log, 1, 1); log_create(log, log->rst_info.last_lsn, open_log_count, wrapped, use_multi_page); ra = log_create_ra(log); if (!ra) { err = -ENOMEM; goto out; } log->ra = ra; /* Put the restart areas and initialize * the log file as required. */ goto process_log; } if (!ra2) { err = -EINVAL; goto out; } /* * If the log page or the system page sizes have changed, we can't * use the log file. We must use the system page size instead of the * default size if there is not a clean shutdown. */ t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size); if (log->page_size != t32) { log->l_size = log->orig_file_size; log->page_size = norm_file_page(t32, &log->l_size, t32 == DefaultLogPageSize); } if (log->page_size != t32 || log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) { err = -EINVAL; goto out; } log->page_mask = log->page_size - 1; log->page_bits = blksize_bits(log->page_size); /* If the file size has shrunk then we won't mount it. */ if (log->l_size < le64_to_cpu(ra2->l_size)) { err = -EINVAL; goto out; } log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver), le16_to_cpu(log->rst_info.r_page->minor_ver)); log->l_size = le64_to_cpu(ra2->l_size); log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); log->file_data_bits = sizeof(u64) * 8 - log->seq_num_bits; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->last_lsn = le64_to_cpu(ra2->current_lsn); log->seq_num = log->last_lsn >> log->file_data_bits; log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off); log->restart_size = log->sys_page_size - log->ra_off; log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); log->ra_size = le16_to_cpu(ra2->ra_len); log->data_off = le16_to_cpu(ra2->data_off); log->data_size = log->page_size - log->data_off; log->reserved = log->data_size - log->record_header_len; vbo = lsn_to_vbo(log, log->last_lsn); if (vbo < log->first_page) { /* This is a pseudo lsn. */ log->l_flags |= NTFSLOG_NO_LAST_LSN; log->next_page = log->first_page; goto find_oldest; } /* Find the end of this log record. */ off = final_log_off(log, log->last_lsn, le32_to_cpu(ra2->last_lsn_data_len)); /* If we wrapped the file then increment the sequence number. */ if (off <= vbo) { log->seq_num += 1; log->l_flags |= NTFSLOG_WRAPPED; } /* Now compute the next log page to use. */ vbo &= ~log->sys_page_mask; tail = log->page_size - (off & log->page_mask) - 1; /* *If we can fit another log record on the page, * move back a page the log file. */ if (tail >= log->record_header_len) { log->l_flags |= NTFSLOG_REUSE_TAIL; log->next_page = vbo; } else { log->next_page = next_page_off(log, vbo); } find_oldest: /* * Find the oldest client lsn. Use the last * flushed lsn as a starting point. */ log->oldest_lsn = log->last_lsn; oldest_client_lsn(Add2Ptr(ra2, le16_to_cpu(ra2->client_off)), ra2->client_idx[1], &log->oldest_lsn); log->oldest_lsn_off = lsn_to_vbo(log, log->oldest_lsn); if (log->oldest_lsn_off < log->first_page) log->l_flags |= NTFSLOG_NO_OLDEST_LSN; if (!(ra2->flags & RESTART_SINGLE_PAGE_IO)) log->l_flags |= NTFSLOG_WRAPPED | NTFSLOG_MULTIPLE_PAGE_IO; log->current_openlog_count = le32_to_cpu(ra2->open_log_count); log->total_avail_pages = log->l_size - log->first_page; log->total_avail = log->total_avail_pages >> log->page_bits; log->max_current_avail = log->total_avail * log->reserved; log->total_avail = log->total_avail * log->data_size; log->current_avail = current_log_avail(log); ra = kzalloc(log->restart_size, GFP_NOFS); if (!ra) { err = -ENOMEM; goto out; } log->ra = ra; t16 = le16_to_cpu(ra2->client_off); if (t16 == offsetof(struct RESTART_AREA, clients)) { memcpy(ra, ra2, log->ra_size); } else { memcpy(ra, ra2, offsetof(struct RESTART_AREA, clients)); memcpy(ra->clients, Add2Ptr(ra2, t16), le16_to_cpu(ra2->ra_len) - t16); log->current_openlog_count = get_random_u32(); ra->open_log_count = cpu_to_le32(log->current_openlog_count); log->ra_size = offsetof(struct RESTART_AREA, clients) + sizeof(struct CLIENT_REC); ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients)); ra->ra_len = cpu_to_le16(log->ra_size); } le32_add_cpu(&ra->open_log_count, 1); /* Now we need to walk through looking for the last lsn. */ err = last_log_lsn(log); if (err) goto out; log->current_avail = current_log_avail(log); /* Remember which restart area to write first. */ log->init_ra = log->rst_info.vbo; process_log: /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ switch ((log->major_ver << 16) + log->minor_ver) { case 0x10000: case 0x10001: case 0x20000: break; default: ntfs_warn(sbi->sb, "\x24LogFile version %d.%d is not supported", log->major_ver, log->minor_ver); err = -EOPNOTSUPP; log->set_dirty = true; goto out; } /* One client "NTFS" per logfile. */ ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); for (client = ra->client_idx[1];; client = cr->next_client) { if (client == LFS_NO_CLIENT_LE) { /* Insert "NTFS" client LogFile. */ client = ra->client_idx[0]; if (client == LFS_NO_CLIENT_LE) { err = -EINVAL; goto out; } t16 = le16_to_cpu(client); cr = ca + t16; remove_client(ca, cr, &ra->client_idx[0]); cr->restart_lsn = 0; cr->oldest_lsn = cpu_to_le64(log->oldest_lsn); cr->name_bytes = cpu_to_le32(8); cr->name[0] = cpu_to_le16('N'); cr->name[1] = cpu_to_le16('T'); cr->name[2] = cpu_to_le16('F'); cr->name[3] = cpu_to_le16('S'); add_client(ca, t16, &ra->client_idx[1]); break; } cr = ca + le16_to_cpu(client); if (cpu_to_le32(8) == cr->name_bytes && cpu_to_le16('N') == cr->name[0] && cpu_to_le16('T') == cr->name[1] && cpu_to_le16('F') == cr->name[2] && cpu_to_le16('S') == cr->name[3]) break; } /* Update the client handle with the client block information. */ log->client_id.seq_num = cr->seq_num; log->client_id.client_idx = client; err = read_rst_area(log, &rst, &checkpt_lsn); if (err) goto out; if (!rst) goto out; bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28; if (rst->check_point_start) checkpt_lsn = le64_to_cpu(rst->check_point_start); /* Allocate and Read the Transaction Table. */ if (!rst->transact_table_len) goto check_dirty_page_table; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->transact_table_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t16 = le16_to_cpu(lrh->redo_off); rt = Add2Ptr(lrh, t16); t32 = rec_len - t16; /* Now check that this is a valid restart table. */ if (!check_rstbl(rt, t32)) { err = -EINVAL; goto out; } trtbl = kmemdup(rt, t32, GFP_NOFS); if (!trtbl) { err = -ENOMEM; goto out; } lcb_put(lcb); lcb = NULL; check_dirty_page_table: /* The next record back should be the Dirty Pages Table. */ if (!rst->dirty_pages_len) goto check_attribute_names; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->dirty_pages_table_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t16 = le16_to_cpu(lrh->redo_off); rt = Add2Ptr(lrh, t16); t32 = rec_len - t16; /* Now check that this is a valid restart table. */ if (!check_rstbl(rt, t32)) { err = -EINVAL; goto out; } dptbl = kmemdup(rt, t32, GFP_NOFS); if (!dptbl) { err = -ENOMEM; goto out; } /* Convert Ra version '0' into version '1'. */ if (rst->major_ver) goto end_conv_1; /* reduce tab pressure. */ dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { struct DIR_PAGE_ENTRY_32 *dp0 = (struct DIR_PAGE_ENTRY_32 *)dp; // NOTE: Danger. Check for of boundary. memmove(&dp->vcn, &dp0->vcn_low, 2 * sizeof(u64) + le32_to_cpu(dp->lcns_follow) * sizeof(u64)); } end_conv_1: lcb_put(lcb); lcb = NULL; /* * Go through the table and remove the duplicates, * remembering the oldest lsn values. */ if (sbi->cluster_size <= log->page_size) goto trace_dp_table; /* reduce tab pressure. */ dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { struct DIR_PAGE_ENTRY *next = dp; while ((next = enum_rstbl(dptbl, next))) { if (next->target_attr == dp->target_attr && next->vcn == dp->vcn) { if (le64_to_cpu(next->oldest_lsn) < le64_to_cpu(dp->oldest_lsn)) { dp->oldest_lsn = next->oldest_lsn; } free_rsttbl_idx(dptbl, PtrOffset(dptbl, next)); } } } trace_dp_table: check_attribute_names: /* The next record should be the Attribute Names. */ if (!rst->attr_names_len) goto check_attr_table; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->attr_names_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t32 = lrh_length(lrh); attr_names_bytes = rec_len - t32; attr_names = kmemdup(Add2Ptr(lrh, t32), attr_names_bytes, GFP_NOFS); if (!attr_names) { err = -ENOMEM; goto out; } lcb_put(lcb); lcb = NULL; check_attr_table: /* The next record should be the attribute Table. */ if (!rst->open_attr_len) goto check_attribute_names2; /* reduce tab pressure. */ t64 = le64_to_cpu(rst->open_attr_table_lsn); err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); if (err) goto out; lrh = lcb->log_rec; frh = lcb->lrh; rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), bytes_per_attr_entry)) { err = -EINVAL; goto out; } t16 = le16_to_cpu(lrh->redo_off); rt = Add2Ptr(lrh, t16); oatbl_bytes = rec_len - t16; if (!check_rstbl(rt, oatbl_bytes)) { err = -EINVAL; goto out; } oatbl = kmemdup(rt, oatbl_bytes, GFP_NOFS); if (!oatbl) { err = -ENOMEM; goto out; } log->open_attr_tbl = oatbl; /* Clear all of the Attr pointers. */ oe = NULL; while ((oe = enum_rstbl(oatbl, oe))) { if (!rst->major_ver) { struct OPEN_ATTR_ENRTY_32 oe0; /* Really 'oe' points to OPEN_ATTR_ENRTY_32. */ memcpy(&oe0, oe, SIZEOF_OPENATTRIBUTEENTRY0); oe->bytes_per_index = oe0.bytes_per_index; oe->type = oe0.type; oe->is_dirty_pages = oe0.is_dirty_pages; oe->name_len = 0; oe->ref = oe0.ref; oe->open_record_lsn = oe0.open_record_lsn; } oe->is_attr_name = 0; oe->ptr = NULL; } lcb_put(lcb); lcb = NULL; check_attribute_names2: if (attr_names && oatbl) { off = 0; for (;;) { /* Check we can use attribute name entry 'ane'. */ static_assert(sizeof(*ane) == 4); if (off + sizeof(*ane) > attr_names_bytes) { /* just ignore the rest. */ break; } ane = Add2Ptr(attr_names, off); t16 = le16_to_cpu(ane->off); if (!t16) { /* this is the only valid exit. */ break; } /* Check we can use open attribute entry 'oe'. */ if (t16 + sizeof(*oe) > oatbl_bytes) { /* just ignore the rest. */ break; } /* TODO: Clear table on exit! */ oe = Add2Ptr(oatbl, t16); t16 = le16_to_cpu(ane->name_bytes); off += t16 + sizeof(*ane); if (off > attr_names_bytes) { /* just ignore the rest. */ break; } oe->name_len = t16 / sizeof(short); oe->ptr = ane->name; oe->is_attr_name = 2; } } /* * If the checkpt_lsn is zero, then this is a freshly * formatted disk and we have no work to do. */ if (!checkpt_lsn) { err = 0; goto out; } if (!oatbl) { oatbl = init_rsttbl(bytes_per_attr_entry, 8); if (!oatbl) { err = -ENOMEM; goto out; } } log->open_attr_tbl = oatbl; /* Start the analysis pass from the Checkpoint lsn. */ rec_lsn = checkpt_lsn; /* Read the first lsn. */ err = read_log_rec_lcb(log, checkpt_lsn, lcb_ctx_next, &lcb); if (err) goto out; /* Loop to read all subsequent records to the end of the log file. */ next_log_record_analyze: err = read_next_log_rec(log, lcb, &rec_lsn); if (err) goto out; if (!rec_lsn) goto end_log_records_enumerate; frh = lcb->lrh; transact_id = le32_to_cpu(frh->transact_id); rec_len = le32_to_cpu(frh->client_data_len); lrh = lcb->log_rec; if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { err = -EINVAL; goto out; } /* * The first lsn after the previous lsn remembered * the checkpoint is the first candidate for the rlsn. */ if (!rlsn) rlsn = rec_lsn; if (LfsClientRecord != frh->record_type) goto next_log_record_analyze; /* * Now update the Transaction Table for this transaction. If there * is no entry present or it is unallocated we allocate the entry. */ if (!trtbl) { trtbl = init_rsttbl(sizeof(struct TRANSACTION_ENTRY), INITIAL_NUMBER_TRANSACTIONS); if (!trtbl) { err = -ENOMEM; goto out; } } tr = Add2Ptr(trtbl, transact_id); if (transact_id >= bytes_per_rt(trtbl) || tr->next != RESTART_ENTRY_ALLOCATED_LE) { tr = alloc_rsttbl_from_idx(&trtbl, transact_id); if (!tr) { err = -ENOMEM; goto out; } tr->transact_state = TransactionActive; tr->first_lsn = cpu_to_le64(rec_lsn); } tr->prev_lsn = tr->undo_next_lsn = cpu_to_le64(rec_lsn); /* * If this is a compensation log record, then change * the undo_next_lsn to be the undo_next_lsn of this record. */ if (lrh->undo_op == cpu_to_le16(CompensationLogRecord)) tr->undo_next_lsn = frh->client_undo_next_lsn; /* Dispatch to handle log record depending on type. */ switch (le16_to_cpu(lrh->redo_op)) { case InitializeFileRecordSegment: case DeallocateFileRecordSegment: case WriteEndOfFileRecordSegment: case CreateAttribute: case DeleteAttribute: case UpdateResidentValue: case UpdateNonresidentValue: case UpdateMappingPairs: case SetNewAttributeSizes: case AddIndexEntryRoot: case DeleteIndexEntryRoot: case AddIndexEntryAllocation: case DeleteIndexEntryAllocation: case WriteEndOfIndexBuffer: case SetIndexEntryVcnRoot: case SetIndexEntryVcnAllocation: case UpdateFileNameRoot: case UpdateFileNameAllocation: case SetBitsInNonresidentBitMap: case ClearBitsInNonresidentBitMap: case UpdateRecordDataRoot: case UpdateRecordDataAllocation: case ZeroEndOfFileRecord: t16 = le16_to_cpu(lrh->target_attr); t64 = le64_to_cpu(lrh->target_vcn); dp = find_dp(dptbl, t16, t64); if (dp) goto copy_lcns; /* * Calculate the number of clusters per page the system * which wrote the checkpoint, possibly creating the table. */ if (dptbl) { t32 = (le16_to_cpu(dptbl->size) - sizeof(struct DIR_PAGE_ENTRY)) / sizeof(u64); } else { t32 = log->clst_per_page; kfree(dptbl); dptbl = init_rsttbl(struct_size(dp, page_lcns, t32), 32); if (!dptbl) { err = -ENOMEM; goto out; } } dp = alloc_rsttbl_idx(&dptbl); if (!dp) { err = -ENOMEM; goto out; } dp->target_attr = cpu_to_le32(t16); dp->transfer_len = cpu_to_le32(t32 << sbi->cluster_bits); dp->lcns_follow = cpu_to_le32(t32); dp->vcn = cpu_to_le64(t64 & ~((u64)t32 - 1)); dp->oldest_lsn = cpu_to_le64(rec_lsn); copy_lcns: /* * Copy the Lcns from the log record into the Dirty Page Entry. * TODO: For different page size support, must somehow make * whole routine a loop, case Lcns do not fit below. */ t16 = le16_to_cpu(lrh->lcns_follow); for (i = 0; i < t16; i++) { size_t j = (size_t)(le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn)); dp->page_lcns[j + i] = lrh->page_lcns[i]; } goto next_log_record_analyze; case DeleteDirtyClusters: { u32 range_count = le16_to_cpu(lrh->redo_len) / sizeof(struct LCN_RANGE); const struct LCN_RANGE *r = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); /* Loop through all of the Lcn ranges this log record. */ for (i = 0; i < range_count; i++, r++) { u64 lcn0 = le64_to_cpu(r->lcn); u64 lcn_e = lcn0 + le64_to_cpu(r->len) - 1; dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { u32 j; t32 = le32_to_cpu(dp->lcns_follow); for (j = 0; j < t32; j++) { t64 = le64_to_cpu(dp->page_lcns[j]); if (t64 >= lcn0 && t64 <= lcn_e) dp->page_lcns[j] = 0; } } } goto next_log_record_analyze; } case OpenNonresidentAttribute: t16 = le16_to_cpu(lrh->target_attr); if (t16 >= bytes_per_rt(oatbl)) { /* * Compute how big the table needs to be. * Add 10 extra entries for some cushion. */ u32 new_e = t16 / le16_to_cpu(oatbl->size); new_e += 10 - le16_to_cpu(oatbl->used); oatbl = extend_rsttbl(oatbl, new_e, ~0u); log->open_attr_tbl = oatbl; if (!oatbl) { err = -ENOMEM; goto out; } } /* Point to the entry being opened. */ oe = alloc_rsttbl_from_idx(&oatbl, t16); log->open_attr_tbl = oatbl; if (!oe) { err = -ENOMEM; goto out; } /* Initialize this entry from the log record. */ t16 = le16_to_cpu(lrh->redo_off); if (!rst->major_ver) { /* Convert version '0' into version '1'. */ struct OPEN_ATTR_ENRTY_32 *oe0 = Add2Ptr(lrh, t16); oe->bytes_per_index = oe0->bytes_per_index; oe->type = oe0->type; oe->is_dirty_pages = oe0->is_dirty_pages; oe->name_len = 0; //oe0.name_len; oe->ref = oe0->ref; oe->open_record_lsn = oe0->open_record_lsn; } else { memcpy(oe, Add2Ptr(lrh, t16), bytes_per_attr_entry); } t16 = le16_to_cpu(lrh->undo_len); if (t16) { oe->ptr = kmalloc(t16, GFP_NOFS); if (!oe->ptr) { err = -ENOMEM; goto out; } oe->name_len = t16 / sizeof(short); memcpy(oe->ptr, Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)), t16); oe->is_attr_name = 1; } else { oe->ptr = NULL; oe->is_attr_name = 0; } goto next_log_record_analyze; case HotFix: t16 = le16_to_cpu(lrh->target_attr); t64 = le64_to_cpu(lrh->target_vcn); dp = find_dp(dptbl, t16, t64); if (dp) { size_t j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn); if (dp->page_lcns[j]) dp->page_lcns[j] = lrh->page_lcns[0]; } goto next_log_record_analyze; case EndTopLevelAction: tr = Add2Ptr(trtbl, transact_id); tr->prev_lsn = cpu_to_le64(rec_lsn); tr->undo_next_lsn = frh->client_undo_next_lsn; goto next_log_record_analyze; case PrepareTransaction: tr = Add2Ptr(trtbl, transact_id); tr->transact_state = TransactionPrepared; goto next_log_record_analyze; case CommitTransaction: tr = Add2Ptr(trtbl, transact_id); tr->transact_state = TransactionCommitted; goto next_log_record_analyze; case ForgetTransaction: free_rsttbl_idx(trtbl, transact_id); goto next_log_record_analyze; case Noop: case OpenAttributeTableDump: case AttributeNamesDump: case DirtyPageTableDump: case TransactionTableDump: /* The following cases require no action the Analysis Pass. */ goto next_log_record_analyze; default: /* * All codes will be explicitly handled. * If we see a code we do not expect, then we are trouble. */ goto next_log_record_analyze; } end_log_records_enumerate: lcb_put(lcb); lcb = NULL; /* * Scan the Dirty Page Table and Transaction Table for * the lowest lsn, and return it as the Redo lsn. */ dp = NULL; while ((dp = enum_rstbl(dptbl, dp))) { t64 = le64_to_cpu(dp->oldest_lsn); if (t64 && t64 < rlsn) rlsn = t64; } tr = NULL; while ((tr = enum_rstbl(trtbl, tr))) { t64 = le64_to_cpu(tr->first_lsn); if (t64 && t64 < rlsn) rlsn = t64; } /* * Only proceed if the Dirty Page Table or Transaction * table are not empty. */ if ((!dptbl || !dptbl->total) && (!trtbl || !trtbl->total)) goto end_replay; sbi->flags |= NTFS_FLAGS_NEED_REPLAY; if (is_ro) goto out; /* Reopen all of the attributes with dirty pages. */ oe = NULL; next_open_attribute: oe = enum_rstbl(oatbl, oe); if (!oe) { err = 0; dp = NULL; goto next_dirty_page; } oa = kzalloc(sizeof(struct OpenAttr), GFP_NOFS); if (!oa) { err = -ENOMEM; goto out; } inode = ntfs_iget5(sbi->sb, &oe->ref, NULL); if (IS_ERR(inode)) goto fake_attr; if (is_bad_inode(inode)) { iput(inode); fake_attr: if (oa->ni) { iput(&oa->ni->vfs_inode); oa->ni = NULL; } attr = attr_create_nonres_log(sbi, oe->type, 0, oe->ptr, oe->name_len, 0); if (!attr) { kfree(oa); err = -ENOMEM; goto out; } oa->attr = attr; oa->run1 = &oa->run0; goto final_oe; } ni_oe = ntfs_i(inode); oa->ni = ni_oe; attr = ni_find_attr(ni_oe, NULL, NULL, oe->type, oe->ptr, oe->name_len, NULL, NULL); if (!attr) goto fake_attr; t32 = le32_to_cpu(attr->size); oa->attr = kmemdup(attr, t32, GFP_NOFS); if (!oa->attr) goto fake_attr; if (!S_ISDIR(inode->i_mode)) { if (attr->type == ATTR_DATA && !attr->name_len) { oa->run1 = &ni_oe->file.run; goto final_oe; } } else { if (attr->type == ATTR_ALLOC && attr->name_len == ARRAY_SIZE(I30_NAME) && !memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) { oa->run1 = &ni_oe->dir.alloc_run; goto final_oe; } } if (attr->non_res) { u16 roff = le16_to_cpu(attr->nres.run_off); CLST svcn = le64_to_cpu(attr->nres.svcn); if (roff > t32) { kfree(oa->attr); oa->attr = NULL; goto fake_attr; } err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn, le64_to_cpu(attr->nres.evcn), svcn, Add2Ptr(attr, roff), t32 - roff); if (err < 0) { kfree(oa->attr); oa->attr = NULL; goto fake_attr; } err = 0; } oa->run1 = &oa->run0; attr = oa->attr; final_oe: if (oe->is_attr_name == 1) kfree(oe->ptr); oe->is_attr_name = 0; oe->ptr = oa; oe->name_len = attr->name_len; goto next_open_attribute; /* * Now loop through the dirty page table to extract all of the Vcn/Lcn. * Mapping that we have, and insert it into the appropriate run. */ next_dirty_page: dp = enum_rstbl(dptbl, dp); if (!dp) goto do_redo_1; oe = Add2Ptr(oatbl, le32_to_cpu(dp->target_attr)); if (oe->next != RESTART_ENTRY_ALLOCATED_LE) goto next_dirty_page; oa = oe->ptr; if (!oa) goto next_dirty_page; i = -1; next_dirty_page_vcn: i += 1; if (i >= le32_to_cpu(dp->lcns_follow)) goto next_dirty_page; vcn = le64_to_cpu(dp->vcn) + i; size = (vcn + 1) << sbi->cluster_bits; if (!dp->page_lcns[i]) goto next_dirty_page_vcn; rno = ino_get(&oe->ref); if (rno <= MFT_REC_MIRR && size < (MFT_REC_VOL + 1) * sbi->record_size && oe->type == ATTR_DATA) { goto next_dirty_page_vcn; } lcn = le64_to_cpu(dp->page_lcns[i]); if ((!run_lookup_entry(oa->run1, vcn, &lcn0, &len0, NULL) || lcn0 != lcn) && !run_add_entry(oa->run1, vcn, lcn, 1, false)) { err = -ENOMEM; goto out; } attr = oa->attr; if (size > le64_to_cpu(attr->nres.alloc_size)) { attr->nres.valid_size = attr->nres.data_size = attr->nres.alloc_size = cpu_to_le64(size); } goto next_dirty_page_vcn; do_redo_1: /* * Perform the Redo Pass, to restore all of the dirty pages to the same * contents that they had immediately before the crash. If the dirty * page table is empty, then we can skip the entire Redo Pass. */ if (!dptbl || !dptbl->total) goto do_undo_action; rec_lsn = rlsn; /* * Read the record at the Redo lsn, before falling * into common code to handle each record. */ err = read_log_rec_lcb(log, rlsn, lcb_ctx_next, &lcb); if (err) goto out; /* * Now loop to read all of our log records forwards, until * we hit the end of the file, cleaning up at the end. */ do_action_next: frh = lcb->lrh; if (LfsClientRecord != frh->record_type) goto read_next_log_do_action; transact_id = le32_to_cpu(frh->transact_id); rec_len = le32_to_cpu(frh->client_data_len); lrh = lcb->log_rec; if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { err = -EINVAL; goto out; } /* Ignore log records that do not update pages. */ if (lrh->lcns_follow) goto find_dirty_page; goto read_next_log_do_action; find_dirty_page: t16 = le16_to_cpu(lrh->target_attr); t64 = le64_to_cpu(lrh->target_vcn); dp = find_dp(dptbl, t16, t64); if (!dp) goto read_next_log_do_action; if (rec_lsn < le64_to_cpu(dp->oldest_lsn)) goto read_next_log_do_action; t16 = le16_to_cpu(lrh->target_attr); if (t16 >= bytes_per_rt(oatbl)) { err = -EINVAL; goto out; } oe = Add2Ptr(oatbl, t16); if (oe->next != RESTART_ENTRY_ALLOCATED_LE) { err = -EINVAL; goto out; } oa = oe->ptr; if (!oa) { err = -EINVAL; goto out; } attr = oa->attr; vcn = le64_to_cpu(lrh->target_vcn); if (!run_lookup_entry(oa->run1, vcn, &lcn, NULL, NULL) || lcn == SPARSE_LCN) { goto read_next_log_do_action; } /* Point to the Redo data and get its length. */ data = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); dlen = le16_to_cpu(lrh->redo_len); /* Shorten length by any Lcns which were deleted. */ saved_len = dlen; for (i = le16_to_cpu(lrh->lcns_follow); i; i--) { size_t j; u32 alen, voff; voff = le16_to_cpu(lrh->record_off) + le16_to_cpu(lrh->attr_off); voff += le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; /* If the Vcn question is allocated, we can just get out. */ j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn); if (dp->page_lcns[j + i - 1]) break; if (!saved_len) saved_len = 1; /* * Calculate the allocated space left relative to the * log record Vcn, after removing this unallocated Vcn. */ alen = (i - 1) << sbi->cluster_bits; /* * If the update described this log record goes beyond * the allocated space, then we will have to reduce the length. */ if (voff >= alen) dlen = 0; else if (voff + dlen > alen) dlen = alen - voff; } /* * If the resulting dlen from above is now zero, * we can skip this log record. */ if (!dlen && saved_len) goto read_next_log_do_action; t16 = le16_to_cpu(lrh->redo_op); if (can_skip_action(t16)) goto read_next_log_do_action; /* Apply the Redo operation a common routine. */ err = do_action(log, oe, lrh, t16, data, dlen, rec_len, &rec_lsn); if (err) goto out; /* Keep reading and looping back until end of file. */ read_next_log_do_action: err = read_next_log_rec(log, lcb, &rec_lsn); if (!err && rec_lsn) goto do_action_next; lcb_put(lcb); lcb = NULL; do_undo_action: /* Scan Transaction Table. */ tr = NULL; transaction_table_next: tr = enum_rstbl(trtbl, tr); if (!tr) goto undo_action_done; if (TransactionActive != tr->transact_state || !tr->undo_next_lsn) { free_rsttbl_idx(trtbl, PtrOffset(trtbl, tr)); goto transaction_table_next; } log->transaction_id = PtrOffset(trtbl, tr); undo_next_lsn = le64_to_cpu(tr->undo_next_lsn); /* * We only have to do anything if the transaction has * something its undo_next_lsn field. */ if (!undo_next_lsn) goto commit_undo; /* Read the first record to be undone by this transaction. */ err = read_log_rec_lcb(log, undo_next_lsn, lcb_ctx_undo_next, &lcb); if (err) goto out; /* * Now loop to read all of our log records forwards, * until we hit the end of the file, cleaning up at the end. */ undo_action_next: lrh = lcb->log_rec; frh = lcb->lrh; transact_id = le32_to_cpu(frh->transact_id); rec_len = le32_to_cpu(frh->client_data_len); if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { err = -EINVAL; goto out; } if (lrh->undo_op == cpu_to_le16(Noop)) goto read_next_log_undo_action; oe = Add2Ptr(oatbl, le16_to_cpu(lrh->target_attr)); oa = oe->ptr; t16 = le16_to_cpu(lrh->lcns_follow); if (!t16) goto add_allocated_vcns; is_mapped = run_lookup_entry(oa->run1, le64_to_cpu(lrh->target_vcn), &lcn, &clen, NULL); /* * If the mapping isn't already the table or the mapping * corresponds to a hole the mapping, we need to make sure * there is no partial page already memory. */ if (is_mapped && lcn != SPARSE_LCN && clen >= t16) goto add_allocated_vcns; vcn = le64_to_cpu(lrh->target_vcn); vcn &= ~(u64)(log->clst_per_page - 1); add_allocated_vcns: for (i = 0, vcn = le64_to_cpu(lrh->target_vcn), size = (vcn + 1) << sbi->cluster_bits; i < t16; i++, vcn += 1, size += sbi->cluster_size) { attr = oa->attr; if (!attr->non_res) { if (size > le32_to_cpu(attr->res.data_size)) attr->res.data_size = cpu_to_le32(size); } else { if (size > le64_to_cpu(attr->nres.data_size)) attr->nres.valid_size = attr->nres.data_size = attr->nres.alloc_size = cpu_to_le64(size); } } t16 = le16_to_cpu(lrh->undo_op); if (can_skip_action(t16)) goto read_next_log_undo_action; /* Point to the Redo data and get its length. */ data = Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)); dlen = le16_to_cpu(lrh->undo_len); /* It is time to apply the undo action. */ err = do_action(log, oe, lrh, t16, data, dlen, rec_len, NULL); read_next_log_undo_action: /* * Keep reading and looping back until we have read the * last record for this transaction. */ err = read_next_log_rec(log, lcb, &rec_lsn); if (err) goto out; if (rec_lsn) goto undo_action_next; lcb_put(lcb); lcb = NULL; commit_undo: free_rsttbl_idx(trtbl, log->transaction_id); log->transaction_id = 0; goto transaction_table_next; undo_action_done: ntfs_update_mftmirr(sbi, 0); sbi->flags &= ~NTFS_FLAGS_NEED_REPLAY; end_replay: err = 0; if (is_ro) goto out; rh = kzalloc(log->page_size, GFP_NOFS); if (!rh) { err = -ENOMEM; goto out; } rh->rhdr.sign = NTFS_RSTR_SIGNATURE; rh->rhdr.fix_off = cpu_to_le16(offsetof(struct RESTART_HDR, fixups)); t16 = (log->page_size >> SECTOR_SHIFT) + 1; rh->rhdr.fix_num = cpu_to_le16(t16); rh->sys_page_size = cpu_to_le32(log->page_size); rh->page_size = cpu_to_le32(log->page_size); t16 = ALIGN(offsetof(struct RESTART_HDR, fixups) + sizeof(short) * t16, 8); rh->ra_off = cpu_to_le16(t16); rh->minor_ver = cpu_to_le16(1); // 0x1A: rh->major_ver = cpu_to_le16(1); // 0x1C: ra2 = Add2Ptr(rh, t16); memcpy(ra2, ra, sizeof(struct RESTART_AREA)); ra2->client_idx[0] = 0; ra2->client_idx[1] = LFS_NO_CLIENT_LE; ra2->flags = cpu_to_le16(2); le32_add_cpu(&ra2->open_log_count, 1); ntfs_fix_pre_write(&rh->rhdr, log->page_size); err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); if (!err) err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, rh, log->page_size, 0); kfree(rh); if (err) goto out; out: kfree(rst); if (lcb) lcb_put(lcb); /* * Scan the Open Attribute Table to close all of * the open attributes. */ oe = NULL; while ((oe = enum_rstbl(oatbl, oe))) { rno = ino_get(&oe->ref); if (oe->is_attr_name == 1) { kfree(oe->ptr); oe->ptr = NULL; continue; } if (oe->is_attr_name) continue; oa = oe->ptr; if (!oa) continue; run_close(&oa->run0); kfree(oa->attr); if (oa->ni) iput(&oa->ni->vfs_inode); kfree(oa); } kfree(trtbl); kfree(oatbl); kfree(dptbl); kfree(attr_names); kfree(log->rst_info.r_page); kfree(ra); kfree(log->one_page_buf); if (err) sbi->flags |= NTFS_FLAGS_NEED_REPLAY; if (err == -EROFS) err = 0; else if (log->set_dirty) ntfs_set_state(sbi, NTFS_DIRTY_ERROR); kfree(log); return err; } |
| 6148 6310 224 1667 1669 1658 1661 614 221 7966 2216 14 14 10 14 221 224 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_andnot_bit - find N'th set bit in 2 memory regions, * flipping bits in 2nd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_andnot_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns >= @size. */ static __always_inline unsigned long find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_andnot_bit(addr1, addr2, size); } /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. */ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. * @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */ |
| 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 | #ifndef __LINUX_MROUTE_BASE_H #define __LINUX_MROUTE_BASE_H #include <linux/netdevice.h> #include <linux/rhashtable-types.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> /** * struct vif_device - interface representor for multicast routing * @dev: network device being used * @dev_tracker: refcount tracker for @dev reference * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing * @pkt_out: statistic; packets egressing * @rate_limit: Traffic shaping (NI) * @threshold: TTL threshold * @flags: Control flags * @link: Physical interface index * @dev_parent_id: device parent id * @local: Local address * @remote: Remote address for tunnels */ struct vif_device { struct net_device __rcu *dev; netdevice_tracker dev_tracker; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; unsigned char threshold; unsigned short flags; int link; /* Currently only used by ipmr */ struct netdev_phys_item_id dev_parent_id; __be32 local, remote; }; struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; unsigned short vif_index; unsigned short vif_flags; u32 tb_id; }; static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, unsigned int *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { .family = family, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. */ #define MAXVIFS 32 #endif /* Note: This helper is deprecated. */ #define VIF_EXISTS(_mrt, _idx) (!!rcu_access_pointer((_mrt)->vif_table[_idx].dev)) /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), MFC_OFFLOAD = BIT(1), }; /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list * @mfc_parent: source interface (iif) * @mfc_flags: entry flags * @expires: unresolved entry expire time * @unresolved: unresolved cached skbs * @last_assert: time of last assert * @minvif: minimum VIF id * @maxvif: maximum VIF id * @bytes: bytes that have passed for this entry * @pkt: packets that have passed for this entry * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; unsigned short mfc_parent; int mfc_flags; union { struct { unsigned long expires; struct sk_buff_head unresolved; } unres; struct { unsigned long last_assert; int minvif; int maxvif; atomic_long_t bytes; atomic_long_t pkt; atomic_long_t wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; } res; } mfc_un; struct list_head list; struct rcu_head rcu; void (*free)(struct rcu_head *head); }; static inline void mr_cache_put(struct mr_mfc *c) { if (refcount_dec_and_test(&c->mfc_un.res.refcount)) call_rcu(&c->rcu, c->free); } static inline void mr_cache_hold(struct mr_mfc *c) { refcount_inc(&c->mfc_un.res.refcount); } struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; u32 tb_id; }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .mfc = mfc, .tb_id = tb_id }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, unsigned int *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { .family = family, }, .mfc = mfc, .tb_id = tb_id }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } struct mr_table; /** * struct mr_table_ops - callbacks and info for protocol-specific ops * @rht_params: parameters for accessing the MFC hash * @cmparg_any: a hash key to be used for matching on (*,*) routes */ struct mr_table_ops { const struct rhashtable_params *rht_params; void *cmparg_any; }; /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes * @mfc_unres_queue: list of unresolved MFC entries * @vif_table: array containing all possible vifs * @mfc_hash: Hash table of all resolved routes for easy lookup * @mfc_cache_list: list of resovled routes for possible traversal * @maxvif: Identifier of highest value vif currently in use * @cache_resolve_queue_len: current size of unresolved queue * @mroute_do_assert: Whether to inform userspace on wrong ingress * @mroute_do_pim: Whether to receive IGMP PIMv1 * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { struct list_head list; possible_net_t net; struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct vif_device vif_table[MAXVIFS]; struct rhltable mfc_hash; struct list_head mfc_cache_list; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; static inline bool mr_can_free_table(struct net *net) { return !check_net(net) || !net_initialized(net); } #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent); void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm); int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask) { } static inline void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { return NULL; } static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { return NULL; } static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { return NULL; } static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { return -EINVAL; } static inline int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { return -EINVAL; } static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack) { return -EINVAL; } #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } #ifdef CONFIG_PROC_FS struct mr_vif_iter { struct seq_net_private p; struct mr_table *mrt; int ct; }; struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; /* Lock protecting the mr_table's unresolved queue */ spinlock_t *lock; }; #ifdef CONFIG_IP_MROUTE_COMMON void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return *pos ? mr_vif_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { struct mr_mfc_iter *it = seq->private; it->mrt = mrt; it->cache = NULL; it->lock = lock; return *pos ? mr_mfc_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(it->lock); else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } #else static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { return NULL; } static inline void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { return NULL; } static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { return NULL; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { } #endif #endif #endif |
| 309 308 329 327 128 28 17 107 126 39 90 90 126 24 126 78 14 66 27 37 4 298 297 2 3 1 138 17 22 13 190 24 24 126 37 62 126 190 149 41 98 92 168 22 20 17 17 307 306 1 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 | /* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time. The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); /* * Inode orphaned in orphan file or in orphan list? */ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %lu\n", inode->i_ino); ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %lu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); out_put: iput(inode); return ret; } int ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; } |
| 41 41 38 2 1 41 37 20 11 13 13 8 13 3 1 2 81 78 3 43 43 38 4 4 4 34 4 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/btree.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle opening/closing btree */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/log2.h> #include "btree.h" /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp keycmp) { struct hfs_btree *tree; struct hfs_btree_header_rec *head; struct address_space *mapping; struct page *page; unsigned int size; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) return NULL; mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); /* Set the correct compare function */ tree->sb = sb; tree->cnid = id; tree->keycmp = keycmp; tree->inode = iget_locked(sb, id); if (!tree->inode) goto free_tree; BUG_ON(!(tree->inode->i_state & I_NEW)); { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; mutex_init(&HFS_I(tree->inode)->extents_lock); switch (id) { case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); if (HFS_I(tree->inode)->alloc_blocks > HFS_I(tree->inode)->first_blocks) { pr_err("invalid btree extent records\n"); unlock_new_inode(tree->inode); goto free_inode; } tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); if (!HFS_I(tree->inode)->first_blocks) { pr_err("invalid btree extent records (0 size)\n"); unlock_new_inode(tree->inode); goto free_inode; } tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: BUG(); } } unlock_new_inode(tree->inode); mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto free_inode; /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); tree->leaf_tail = be32_to_cpu(head->leaf_tail); tree->node_count = be32_to_cpu(head->node_count); tree->free_nodes = be32_to_cpu(head->free_nodes); tree->attributes = be32_to_cpu(head->attributes); tree->node_size = be16_to_cpu(head->node_size); tree->max_key_len = be16_to_cpu(head->max_key_len); tree->depth = be16_to_cpu(head->depth); size = tree->node_size; if (!is_power_of_2(size)) goto fail_page; if (!tree->node_count) goto fail_page; switch (id) { case HFS_EXT_CNID: if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_page; } break; case HFS_CAT_CNID: if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_page; } break; default: BUG(); } tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; kunmap_local(head); put_page(page); return tree; fail_page: kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; iput(tree->inode); free_tree: kfree(tree); return NULL; } /* Release resources used by a btree */ void hfs_btree_close(struct hfs_btree *tree) { struct hfs_bnode *node; int i; if (!tree) return; for (i = 0; i < NODE_HASH_SIZE; i++) { while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) pr_err("node %d:%d still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } } iput(tree->inode); kfree(tree); } void hfs_btree_write(struct hfs_btree *tree) { struct hfs_btree_header_rec *head; struct hfs_bnode *node; struct page *page; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) /* panic? */ return; /* Load the header */ page = node->page[0]; head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); head->leaf_head = cpu_to_be32(tree->leaf_head); head->leaf_tail = cpu_to_be32(tree->leaf_tail); head->node_count = cpu_to_be32(tree->node_count); head->free_nodes = cpu_to_be32(tree->free_nodes); head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) { struct hfs_btree *tree = prev->tree; struct hfs_bnode *node; struct hfs_bnode_desc desc; __be32 cnid; node = hfs_bnode_create(tree, idx); if (IS_ERR(node)) return node; if (!tree->free_nodes) panic("FIXME!!!"); tree->free_nodes--; prev->next = idx; cnid = cpu_to_be32(idx); hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); node->type = HFS_NODE_MAP; node->num_recs = 1; hfs_bnode_clear(node, 0, tree->node_size); desc.next = 0; desc.prev = 0; desc.type = HFS_NODE_MAP; desc.height = 0; desc.num_recs = cpu_to_be16(1); desc.reserved = 0; hfs_bnode_write(node, &desc, 0, sizeof(desc)); hfs_bnode_write_u16(node, 14, 0x8000); hfs_bnode_write_u16(node, tree->node_size - 2, 14); hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); return node; } /* Make sure @tree has enough space for the @rsvd_nodes */ int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) { struct inode *inode = tree->inode; u32 count; int res; while (tree->free_nodes < rsvd_nodes) { res = hfs_extend_file(inode); if (res) return res; HFS_I(inode)->phys_size = inode->i_size = (loff_t)HFS_I(inode)->alloc_blocks * HFS_SB(tree->sb)->alloc_blksz; HFS_I(inode)->fs_blocks = inode->i_size >> tree->sb->s_blocksize_bits; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; tree->free_nodes += count - tree->node_count; tree->node_count = count; } return 0; } struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) { struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; unsigned off; u16 off16; u16 len; u8 *data, byte, m; int i, res; res = hfs_bmap_reserve(tree, 1); if (res) return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; len = hfs_brec_lenoff(node, 2, &off16); off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; for (;;) { while (len) { byte = data[off]; if (byte != 0xff) { for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { if (!(byte & m)) { idx += i; data[off] |= m; set_page_dirty(*pagep); kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); return hfs_bnode_create(tree, idx); } } } if (++off >= PAGE_SIZE) { kunmap_local(data); data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); hfs_bnode_put(node); if (IS_ERR(next_node)) return next_node; node = next_node; len = hfs_brec_lenoff(node, 0, &off16); off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } void hfs_bmap_free(struct hfs_bnode *node) { struct hfs_btree *tree; struct page *page; u16 off, len; u32 nidx; u8 *data, byte, m; hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) return; len = hfs_brec_lenoff(node, 2, &off); while (nidx >= len * 8) { u32 i; nidx -= len * 8; i = node->next; if (!i) { /* panic */; pr_crit("unable to free bnode %u. bmap not found!\n", node->this); hfs_bnode_put(node); return; } hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; if (node->type != HFS_NODE_MAP) { /* panic */; pr_crit("invalid bmap found! (%u,%d)\n", node->this, node->type); hfs_bnode_put(node); return; } len = hfs_brec_lenoff(node, 0, &off); } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); } |
| 3792 3789 3796 3793 3792 553 6 3789 554 553 544 23 555 3702 3786 3707 8 555 56 3702 3705 3710 3707 3708 3710 3711 140 139 140 3707 3708 3715 3715 139 3705 3715 2142 2143 2141 2139 187 187 1 1 1 1 1 1 3705 3705 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 | // SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record * is generated (defer as much work as possible to record * generation time): * i) context is allocated, * ii) names from getname are stored without a copy, and * iii) inode information stored from path_lookup. * 3) Ability to disable syscall auditing at boot time (audit=0). * 4) Usable by other parts of the kernel (if audit_log* is called, * then a syscall record will be generated automatically for the * current syscall). * 5) Netlink interface to user-space. * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * * Audit userspace, documentation, tests, and bug/issue trackers: * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> #include "audit.h" /* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; /** * struct audit_net - audit private network namespace data * @sk: communication socket */ struct audit_net { struct sock *sk; }; /** * struct auditd_connection - kernel/auditd connection state * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; }; static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static u32 audit_rate_limit; /* Number of outstanding audit_buffers allowed. * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; static struct lsm_prop audit_sig_lsm; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 2) out of memory in audit_log_move [alloc_skb] 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ static atomic_t audit_lost = ATOMIC_INIT(0); /* Monotonically increasing sum of time the kernel has spent * waiting while the backlog limit is exceeded. */ static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; /* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); /* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .mask = -1, .features = 0, .lock = 0,}; static char *audit_feature_names[2] = { "only_unset_loginuid", "loginuid_immutable", }; /** * struct audit_ctl_mutex - serialize requests from userspace * @lock: the mutex used for locking * @owner: the task which owns the lock * * Description: * This is the lock struct used to ensure we only process userspace requests * in an orderly fashion. We can't simply use a mutex/lock here because we * need to track lock ownership so we don't end up blocking the lock owner in * audit_log_start() or similar. */ static struct audit_ctl_mutex { struct mutex lock; void *owner; } audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer * should be at least that large. */ #define AUDIT_BUFSIZ 1024 /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; }; struct audit_reply { __u32 portid; struct net *net; struct sk_buff *skb; }; /** * auditd_test_task - Check to see if a given task is an audit daemon * @task: the task to check * * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. */ int auditd_test_task(struct task_struct *task) { int rc; struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } /** * audit_ctl_lock - Take the audit control lock */ void audit_ctl_lock(void) { mutex_lock(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = current; } /** * audit_ctl_unlock - Drop the audit control lock */ void audit_ctl_unlock(void) { audit_cmd_mutex.owner = NULL; mutex_unlock(&audit_cmd_mutex.lock); } /** * audit_ctl_owner_current - Test to see if the current task owns the lock * * Description: * Return true if the current task owns the audit control lock, false if it * doesn't own the lock. */ static bool audit_ctl_owner_current(void) { return (current == audit_cmd_mutex.owner); } /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: * Returns the PID in relation to the namespace, 0 on failure. */ static pid_t auditd_pid_vnr(void) { pid_t pid; const struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac || !ac->pid) pid = 0; else pid = pid_vnr(ac->pid); rcu_read_unlock(); return pid; } /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * * Description: * Returns the sock pointer if valid, NULL otherwise. The caller must ensure * that a reference is held for the network namespace while the sock is in use. */ static struct sock *audit_get_sk(const struct net *net) { struct audit_net *aunet; if (!net) return NULL; aunet = net_generic(net, audit_net_id); return aunet->sk; } void audit_panic(const char *message) { switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: panic("audit: %s\n", message); break; } } static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int retval = 0; if (!audit_rate_limit) return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { retval = 1; } else { now = jiffies; if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; } } spin_unlock_irqrestore(&lock, flags); return retval; } /** * audit_log_lost - conditionally log lost audit message event * @message: the message stating reason for lost audit message * * Emit at least 1 message per second, even if audit_rate_check is * throttling. * Always increment the lost messages counter. */ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; atomic_inc(&audit_lost); print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } spin_unlock_irqrestore(&lock, flags); } if (print) { if (printk_ratelimit()) pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); audit_panic(message); } } static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { int allow_changes, rc = 0; u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) allow_changes = 0; else allow_changes = 1; if (audit_enabled != AUDIT_OFF) { rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } /* If we are allowed, make the change */ if (allow_changes == 1) *to_change = new; /* Not allowed, update reason */ else if (rc == 0) rc = -EPERM; return rc; } static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) { int rc; if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state); } /** * auditd_conn_free - RCU helper to release an auditd connection struct * @rcu: RCU head * * Description: * Drop any references inside the auditd connection tracking struct and free * the memory. */ static void auditd_conn_free(struct rcu_head *rcu) { struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); } /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer * @skb: the netlink command from the audit daemon * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ static int auditd_set(struct pid *pid, u32 portid, struct net *net, struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); if (!ac_new) return -ENOMEM; ac_new->pid = get_pid(pid); ac_new->portid = portid; ac_new->net = get_net(net); /* send the ack now to avoid a race with the queue backlog */ if (*ack) { nlh = nlmsg_hdr(skb); netlink_ack(skb, nlh, 0, NULL); *ack = false; } spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); rcu_assign_pointer(auditd_conn, ac_new); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); return 0; } /** * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); } /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { /* put the record back in the queue */ skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this * function is called we haven't given up yet on sending the record, but things * are not looking good. The first thing we want to do is try to write the * record via printk and then see if we want to try and hold on to the record * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ if (!audit_default) goto drop; /* the hold queue is only for when the daemon goes away completely, * not -EAGAIN failures; if we are in a -EAGAIN state requeue the * record on the retry queue unless it's full, in which case drop it */ if (error == -EAGAIN) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } audit_log_lost("kauditd retry queue overflow"); goto drop; } /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); return; } /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } /* we have to drop the record, send it via printk as a last effort */ kauditd_printk_skb(skb); audit_log_lost("kauditd retry queue overflow"); kfree_skb(skb); } /** * auditd_reset - Disconnect the auditd connection * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the * hold queue in case auditd reconnects. It is important to note that the @ac * pointer should never be dereferenced inside this function as it may be NULL * or invalid, you can only compare the memory address! If @ac is NULL then * the connection will always be reset. */ static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); if (ac && ac != ac_old) { /* someone already registered a new auditd connection */ spin_unlock_irqrestore(&auditd_conn_lock, flags); return; } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb, -ECONNREFUSED); } /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * * Description: * Send a skb to the audit daemon, returns positive/zero values on success and * negative values on failure; in all cases the skb will be consumed by this * function. If the send results in -ECONNREFUSED the connection with auditd * will be reset. This function may sleep so callers should not hold any locks * where this would cause a problem. */ static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; u32 portid; struct net *net; struct sock *sk; struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local * copies of the namespace, the sock, and the portid; the * namespace and sock aren't going to go away while we hold a * reference and if the portid does become invalid after the RCU * section netlink_unicast() should safely return an error */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); kfree_skb(skb); rc = -ECONNREFUSED; goto err; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); put_net(net); if (rc < 0) goto err; return rc; err: if (ac && rc == -ECONNREFUSED) auditd_reset(ac); return rc; } /** * kauditd_send_queue - Helper for kauditd_thread to flush skb queues * @sk: the sending sock * @portid: the netlink destination * @queue: the skb queue to process * @retry_limit: limit on number of netlink unicast failures * @skb_hook: per-skb hook for additional processing * @err_hook: hook called if the skb fails the netlink unicast send * * Description: * Run through the given queue and attempt to send the audit records to auditd, * returns zero on success, negative values on failure. It is up to the caller * to ensure that the @sk is valid for the duration of this function. * */ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; struct sk_buff *skb = NULL; struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ skb_tail = skb_peek_tail(queue); while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) (*err_hook)(skb, -ECONNREFUSED); continue; } retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ continue; } else goto retry; } else { /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } return (rc >= 0 ? 0 : rc); } /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: * Write a multicast message to anyone listening in the initial network * namespace. This function doesn't consume an skb as might be expected since * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; /* NOTE: we are not taking an additional reference for init_net since * we don't have to worry about it going away */ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to * the skb by the original kaudit unicast socket send routine. The * existing auditd daemon assumes this breakage. Fixing this would * require co-ordinating a change in the established protocol between * the kaudit kernel subsystem and the auditd userspace code. There is * no reason for new multicast clients to continue with this * non-compliance. */ copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; nlh = nlmsg_hdr(copy); nlh->nlmsg_len = skb->len; nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } /** * kauditd_thread - Worker thread to send audit records to userspace * @dummy: unused */ static int kauditd_thread(void *dummy) { int rc; u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; struct auditd_connection *ac; #define UNICAST_RETRIES 5 set_freezable(); while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); goto main_queue; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } /* attempt to flush the retry queue */ rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, (sk ? kauditd_retry_skb : kauditd_hold_skb)); if (ac && rc < 0) auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); /* NOTE: we want to wake up if there is anything on the queue, * regardless of if an auditd is connected, as we need to * do the multicast send and rotate records from the * main queue to the retry/hold queues */ wait_event_freezable(kauditd_wait, (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; } int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ audit_ctl_lock(); audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); put_net(dest->net); kfree(dest); return 0; } struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; out_kfree_skb: kfree_skb(skb); return NULL; } static void audit_free_reply(struct audit_reply *reply) { if (!reply) return; kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); } static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); reply->skb = NULL; audit_free_reply(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag * @multi: multi-part message flag * @payload: payload data * @size: payload size * * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { struct task_struct *tsk; struct audit_reply *reply; reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; reply->skb = audit_make_reply(seq, type, done, multi, payload, size); if (!reply->skb) goto err; reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); if (IS_ERR(tsk)) goto err; return; err: audit_free_reply(reply); } /* * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. */ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users * configure their PAM stack (because that's what the distro does) * to reject login if unable to send messages to audit. If we return * ECONNREFUSED the PAM stack thinks the kernel does not have audit * configured in and will let login proceed. If we return EPERM * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: case AUDIT_ADD: case AUDIT_DEL: return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: case AUDIT_GET_FEATURE: case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ err = -EINVAL; } return err; } static void audit_log_common_recv_msg(struct audit_context *context, struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return; } *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } static inline void audit_log_user_recv_msg(struct audit_buffer **ab, u16 msg_type) { audit_log_common_recv_msg(NULL, ab, msg_type); } static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } static int audit_get_feature(struct sk_buff *skb) { u32 seq; seq = nlmsg_hdr(skb)->nlmsg_seq; audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, u32 old_lock, u32 new_lock, int res) { struct audit_buffer *ab; if (audit_enabled == AUDIT_OFF) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); } static int audit_set_feature(struct audit_features *uaf) { int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); /* if there is ever a version 2 we should handle that here */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature; /* are we changing a locked feature? */ if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; } } /* nothing invalid, do the changes */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; old_lock = af.lock & feature; new_lock = (uaf->lock | af.lock) & feature; if (new_feature != old_feature) audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 1); if (new_feature) af.features |= feature; else af.features &= ~feature; af.lock |= new_lock; } return 0; } static int audit_replace(struct pid *pid) { pid_t pvnr; struct sk_buff *skb; pvnr = pid_vnr(pid); skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, bool *ack) { u32 seq; void *data; int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; struct lsm_context lsmctx = { NULL, 0, 0 }; err = audit_netlink_ok(skb, msg_type); if (err) return err; seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_PID) { /* NOTE: we are using the vnr PID functions below * because the s.pid value is relative to the * namespace of the caller; at present this * doesn't matter much since you can really only * run auditd from the initial pid namespace, but * something to keep in mind if this changes */ pid_t new_pid = s.pid; pid_t auditd_pid; struct pid *req_pid = task_tgid(current); /* Sanity check - PID values must match. Setting * pid to 0 is how auditd ends auditing. */ if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); if (auditd_pid) { /* replacing a healthy auditd is not allowed */ if (new_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } /* only current auditd can unregister itself */ if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } } if (new_pid) { /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, sock_net(NETLINK_CB(skb).sk), skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, err ? 0 : 1); if (err) return err; /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); } else { if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, 1); /* unregister the auditd connection */ auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } if (s.mask == AUDIT_STATUS_LOST) { u32 lost = atomic_xchg(&audit_lost, 0); audit_log_config_change("lost", 0, lost, 1); return lost; } if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); return actual; } break; } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) return err; break; case AUDIT_SET_FEATURE: if (data_len < sizeof(struct audit_features)) return -EINVAL; err = audit_set_feature(data); if (err) return err; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; /* exit early if there isn't at least one character to print */ if (data_len < 2) return -EINVAL; err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ char *str = data; err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); if (err) break; } audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) { /* ensure NULL termination */ str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, str); } else { audit_log_format(ab, " data="); if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=%s audit_enabled=%d res=0", msg_type == AUDIT_ADD_RULE ? "add_rule" : "remove_rule", audit_enabled); audit_log_end(ab); return -EPERM; } err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; size_t msglen = data_len; char *old, *new; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); msglen -= 2 * sizeof(u32); old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); break; } /* OK, here comes... */ err = audit_tag_tree(old, new); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); audit_log_untrustedstring(ab, new); audit_log_format(ab, " res=%d", !err); audit_log_end(ab); kfree(old); kfree(new); break; } case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, &lsmctx); if (err < 0) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len), GFP_KERNEL); if (!sig_data) { if (lsmprop_is_set(&audit_sig_lsm)) security_release_secctx(&lsmctx); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (lsmprop_is_set(&audit_sig_lsm)) { memcpy(sig_data->ctx, lsmctx.context, lsmctx.len); security_release_secctx(&lsmctx); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, struct_size(sig_data, ctx, lsmctx.len)); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; unsigned int t; t = READ_ONCE(current->signal->audit_tty); s.enabled = t & AUDIT_TTY_ENABLE; s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; struct audit_buffer *ab; unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; if (err) t = READ_ONCE(current->signal->audit_tty); else { t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); t = xchg(¤t->signal->audit_tty, t); } old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, s.log_passwd, !err); audit_log_end(ab); break; } default: err = -EINVAL; break; } return err < 0 ? err : 0; } /** * audit_receive - receive messages from a netlink control socket * @skb: the message buffer * * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; int err; nlh = nlmsg_hdr(skb); len = skb->len; audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { ack = nlh->nlmsg_flags & NLM_F_ACK; err = audit_receive_msg(skb, nlh, &ack); /* send an ack if the user asked for one and audit_receive_msg * didn't already do it, or if there was an error. */ if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); /* can't block with the ctrl lock, so penalize the sender now */ if (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current); /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(audit_backlog_wait_time); remove_wait_queue(&audit_backlog_wait, &wait); } } /* Log information about who is connecting to the audit multicast socket */ static void audit_log_multicast(int group, const char *op, int err) { const struct cred *cred; struct tty_struct *tty; char comm[sizeof(current->comm)]; struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); /* exe= */ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); audit_log_end(ab); } /* Run custom bind function on netlink socket group connect or bind requests. */ static int audit_multicast_bind(struct net *net, int group) { int err = 0; if (!capable(CAP_AUDIT_READ)) err = -EPERM; audit_log_multicast(group, "connect", err); return err; } static void audit_multicast_unbind(struct net *net, int group) { audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_multicast_bind, .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } /* limit the timeout in case auditd is blocked/stopped */ aunet->sk->sk_sndtimeo = HZ / 10; return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); /* NOTE: you would think that we would want to check the auditd * connection and potentially reset it here if it lives in this * namespace, but since the auditd connection tracking struct holds a * reference to this namespace (see auditd_set()) we are only ever * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { .init = audit_net_init, .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), }; /* Initialize audit support at boot time. */ static int __init audit_init(void) { int i; if (audit_initialized == AUDIT_DISABLED) return 0; audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); mutex_init(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", str_enabled_disabled(audit_default)); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "state=initialized audit_enabled=%u res=1", audit_enabled); return 0; } postcore_initcall(audit_init); /* * Process kernel command-line parameter at boot time. * audit={0|off} or audit={1|on}. */ static int __init audit_enable(char *str) { if (!strcasecmp(str, "off") || !strcmp(str, "0")) audit_default = AUDIT_OFF; else if (!strcasecmp(str, "on") || !strcmp(str, "1")) audit_default = AUDIT_ON; else { pr_err("audit: invalid 'audit' parameter value (%s)\n", str); audit_default = AUDIT_ON; } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); return 1; } __setup("audit=", audit_enable); /* Process kernel command-line parameter at boot time. * audit_backlog_limit=<n> */ static int __init audit_backlog_limit_set(char *str) { u32 audit_backlog_limit_arg; pr_info("audit_backlog_limit: "); if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { pr_cont("using default of %u, unable to parse %s\n", audit_backlog_limit, str); return 1; } audit_backlog_limit = audit_backlog_limit_arg; pr_cont("%d\n", audit_backlog_limit); return 1; } __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { if (!ab) return; kfree_skb(ab->skb); kmem_cache_free(audit_buffer_cache, ab); } static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); if (!ab) return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; ab->ctx = ctx; ab->gfp_mask = gfp_mask; return ab; err: audit_buffer_free(ab); return NULL; } /** * audit_serial - compute a serial number for the audit record * * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to * determine which pieces belong to the same audit record. The * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) * @gfp_mask: type of allocation * @type: audit message type * * Returns audit_buffer pointer on success or NULL on error. * * Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the task (ctx) is a task that is currently in a * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, then * task context (ctx) should be NULL. */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; struct timespec64 t; unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex, although we do penalize the sender * later in audit_receive() when it is safe to block */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { long rtime = stime; DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); stime = schedule_timeout(rtime); atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } } } ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } audit_get_stamp(ab->ctx, &t, &serial); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. */ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int oldtail = skb_tailroom(skb); int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); int newtail = skb_tailroom(skb); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; } skb->truesize += newtail - oldtail; return newtail; } /* * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ static __printf(2, 0) void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; va_list args2; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); if (avail == 0) { avail = audit_expand(ab, AUDIT_BUFSIZ); if (!avail) goto out; } va_copy(args2, args); len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } if (len > 0) skb_put(skb, len); out_va_end: va_end(args2); out: return; } /** * audit_log_format - format a message into the audit buffer. * @ab: audit_buffer * @fmt: format string * @...: optional parameters matching @fmt string * * All the work is done in audit_log_vformat. */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; if (!ab) return; va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); } /** * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted * * No return value; failure to expand is silently ignored. * * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = len<<1; if (new_len >= avail) { /* Round the buffer request up to the next multiple */ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); for (i = 0; i < len; i++) ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } /* * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ void audit_log_n_string(struct audit_buffer *ab, const char *string, size_t slen) { int avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = slen + 3; /* enclosing quotes + null terminator */ if (new_len > avail) { avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); *ptr++ = '"'; memcpy(ptr, string, slen); ptr += slen; *ptr++ = '"'; *ptr = 0; skb_put(skb, slen + 2); /* don't include null terminator */ } /** * audit_string_contains_control - does a string need to be logged in hex * @string: string to be checked * @len: max length of the string to check */ bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return true; } return false; } /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * @len: length of string (not including trailing null) * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). * * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t len) { if (audit_string_contains_control(string, len)) audit_log_n_hex(ab, string, len); else audit_log_n_string(ab, string, len); } /** * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { char *p, *pathname; if (prefix) audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); } void audit_log_session_info(struct audit_buffer *ab) { unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); if (key) audit_log_untrustedstring(ab, key); else audit_log_format(ab, "(null)"); } int audit_log_task_context(struct audit_buffer *ab) { struct lsm_prop prop; struct lsm_context ctx; int error; security_current_getlsmprop_subj(&prop); if (!lsmprop_is_set(&prop)) return 0; error = security_lsmprop_to_secctx(&prop, &ctx); if (error < 0) { if (error != -EINVAL) goto error_path; return 0; } audit_log_format(ab, " subj=%s", ctx.context); security_release_secctx(&ctx); return 0; error_path: audit_panic("error in audit_log_task_context"); return error; } EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { struct file *exe_file; if (!mm) goto out_null; exe_file = get_mm_exe_file(mm); if (!exe_file) goto out_null; audit_log_d_path(ab, " exe=", &exe_file->f_path); fput(exe_file); return; out_null: audit_log_format(ab, " exe=(null)"); } struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; spin_lock_irqsave(¤t->sighand->siglock, flags); if (current->signal) tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(¤t->sighand->siglock, flags); return tty; } void audit_put_tty(struct tty_struct *tty) { tty_kref_put(tty); } void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_path_denied - report a path restriction denial * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) * @operation: specific operation name */ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled) return; /* Generate log with subject, operation, outcome. */ ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; /* reject if this is not an unset and we don't allow that */ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) return -EPERM; return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, unsigned int oldsessionid, unsigned int sessionid, int rc) { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; struct tty_struct *tty; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", oldsessionid, sessionid, !rc); audit_put_tty(tty); audit_log_end(ab); } /** * audit_set_loginuid - set current task's loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ int audit_set_loginuid(kuid_t loginuid) { unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); oldsessionid = audit_get_sessionid(current); rc = audit_set_loginuid_perm(loginuid); if (rc) goto out; /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } current->sessionid = sessionid; current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; } /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info(int sig, struct task_struct *t) { kuid_t uid = current_uid(), auid; if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { audit_sig_pid = task_tgid_nr(current); auid = audit_get_loginuid(current); if (uid_valid(auid)) audit_sig_uid = auid; else audit_sig_uid = uid; security_current_getlsmprop_subj(&audit_sig_lsm); } return audit_signal_info_syscall(t); } /** * audit_log_end - end one audit record * @ab: the audit_buffer * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; struct nlmsghdr *nlh; if (!ab) return; if (audit_rate_check()) { skb = ab->skb; ab->skb = NULL; /* setup the netlink header, see the comments in * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; /* queue the netlink packet and poke the kauditd thread */ skb_queue_tail(&audit_queue, skb); wake_up_interruptible(&kauditd_wait); } else audit_log_lost("rate limit exceeded"); audit_buffer_free(ab); } /** * audit_log - Log an audit record * @ctx: audit context * @gfp_mask: type of allocation * @type: audit message type * @fmt: format string to use * @...: variable parameters matching the format string * * This is a convenience function that calls audit_log_start, * audit_log_vformat, and audit_log_end. It may be called * in any context. */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); audit_log_end(ab); } } EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); EXPORT_SYMBOL(audit_log); |
| 1046 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mmap #if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_H #include <linux/tracepoint.h> TRACE_EVENT(vm_unmapped_area, TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info), TP_ARGS(addr, info), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, total_vm) __field(unsigned long, flags) __field(unsigned long, length) __field(unsigned long, low_limit) __field(unsigned long, high_limit) __field(unsigned long, align_mask) __field(unsigned long, align_offset) ), TP_fast_assign( __entry->addr = addr; __entry->total_vm = current->mm->total_vm; __entry->flags = info->flags; __entry->length = info->length; __entry->low_limit = info->low_limit; __entry->high_limit = info->high_limit; __entry->align_mask = info->align_mask; __entry->align_offset = info->align_offset; ), TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx", IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr, IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0, __entry->total_vm, __entry->flags, __entry->length, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); TRACE_EVENT(vma_mas_szero, TP_PROTO(struct maple_tree *mt, unsigned long start, unsigned long end), TP_ARGS(mt, start, end), TP_STRUCT__entry( __field(struct maple_tree *, mt) __field(unsigned long, start) __field(unsigned long, end) ), TP_fast_assign( __entry->mt = mt; __entry->start = start; __entry->end = end; ), TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", __entry->mt, (unsigned long) __entry->start, (unsigned long) __entry->end ) ); TRACE_EVENT(vma_store, TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma), TP_ARGS(mt, vma), TP_STRUCT__entry( __field(struct maple_tree *, mt) __field(struct vm_area_struct *, vma) __field(unsigned long, vm_start) __field(unsigned long, vm_end) ), TP_fast_assign( __entry->mt = mt; __entry->vma = vma; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end - 1; ), TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", __entry->mt, __entry->vma, (unsigned long) __entry->vm_start, (unsigned long) __entry->vm_end ) ); TRACE_EVENT(exit_mmap, TP_PROTO(struct mm_struct *mm), TP_ARGS(mm), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(struct maple_tree *, mt) ), TP_fast_assign( __entry->mm = mm; __entry->mt = &mm->mm_mt; ), TP_printk("mt_mod %p, DESTROY", __entry->mt ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h> |
| 13 13 3 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H #define _BCACHEFS_BTREE_JOURNAL_ITER_H #include "bkey.h" struct journal_iter { struct list_head list; enum btree_id btree_id; unsigned level; size_t idx; struct journal_keys *keys; }; /* * Iterate over keys in the btree, with keys from the journal overlaid on top: */ struct btree_and_journal_iter { struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; struct journal_iter journal; struct bpos pos; bool at_end; bool prefetch; bool fail_if_too_many_whiteouts; }; static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, unsigned l_level, const struct journal_key *r) { return -cmp_int(l_level, r->level) ?: cmp_int(l_btree_id, r->btree_id); } static inline int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, const struct journal_key *r) { return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: bpos_cmp(l_pos, r->k->k.p); } static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) { return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, struct btree_and_journal_iter *); int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); static inline void bch2_journal_keys_put_initial(struct bch_fs *c) { if (c->journal_keys.initial_ref_held) bch2_journal_keys_put(c); c->journal_keys.initial_ref_held = false; } int bch2_journal_keys_sort(struct bch_fs *); void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, unsigned, unsigned, struct bpos, struct bpos); void bch2_journal_keys_dump(struct bch_fs *); void bch2_fs_journal_keys_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ |
| 1752 1 271 214 15 196 283 2357 156 37 1752 272 283 196 2444 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I/O iterator iteration building functions. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_IOV_ITER_H #define _LINUX_IOV_ITER_H #include <linux/uio.h> #include <linux/bvec.h> #include <linux/folio_queue.h> typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len, void *priv, void *priv2); /* * Handle ITER_UBUF. */ static __always_inline size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { void __user *base = iter->ubuf; size_t progress = 0, remain; remain = step(base + iter->iov_offset, 0, len, priv, priv2); progress = len - remain; iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_IOVEC. */ static __always_inline size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { const struct iovec *p = iter->__iov; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->__iov; iter->__iov = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_KVEC. */ static __always_inline size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct kvec *p = iter->kvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->kvec; iter->kvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_BVEC. */ static __always_inline size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct bio_vec *p = iter->bvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t offset = p->bv_offset + skip, part; void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE); part = min3(len, (size_t)(p->bv_len - skip), (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2); kunmap_local(kaddr); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= p->bv_len) { skip = 0; p++; } if (remain) break; } while (len); iter->nr_segs -= p - iter->bvec; iter->bvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_FOLIOQ. */ static __always_inline size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct folio_queue *folioq = iter->folioq; unsigned int slot = iter->folioq_slot; size_t progress = 0, skip = iter->iov_offset; if (slot == folioq_nr_slots(folioq)) { /* The iterator may have been extended. */ folioq = folioq->next; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t part, remain, consumed; size_t fsize; void *base; if (!folio) break; fsize = folioq_folio_size(folioq, slot); base = kmap_local_folio(folio, skip); part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= fsize) { skip = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } if (remain) break; } while (len); iter->folioq_slot = slot; iter->folioq = folioq; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_XARRAY. */ static __always_inline size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { struct folio *folio; size_t progress = 0; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; XA_STATE(xas, iter->xarray, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { size_t remain, consumed, offset, part, flen; if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start + progress); flen = min(folio_size(folio) - offset, len); while (flen) { void *base = kmap_local_folio(folio, offset); part = min_t(size_t, flen, PAGE_SIZE - offset_in_page(offset)); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; progress += consumed; len -= consumed; if (remain || len == 0) goto out; flen -= consumed; offset += consumed; } } out: rcu_read_unlock(); iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_DISCARD. */ static __always_inline size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { size_t progress = len; iter->count -= progress; return progress; } /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * Two step functions, @step and @ustep, must be provided, one for handling * mapped kernel addresses and the other is given user addresses which have the * potential to fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f ustep, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (likely(iter_is_ubuf(iter))) return iterate_ubuf(iter, len, priv, priv2, ustep); if (likely(iter_is_iovec(iter))) return iterate_iovec(iter, len, priv, priv2, ustep); if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } /** * iterate_and_advance - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * As iterate_and_advance2(), but priv2 is always NULL. */ static __always_inline size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, iov_ustep_f ustep, iov_step_f step) { return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } /** * iterate_and_advance_kernel - Iterate over a kernel-internal iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type * iterators; it will not handle UBUF or IOVEC-type iterators. * * A step functions, @step, must be provided, one for handling mapped kernel * addresses and the other is given user addresses which have the potential to * fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } #endif /* _LINUX_IOV_ITER_H */ |
| 1 1 1 1 2 2 2 3 1 2 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 | // SPDX-License-Identifier: GPL-2.0-or-later /* * resize.c * * volume resize. * Inspired by ext3/resize.c. * * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" #include "super.h" #include "sysfile.h" #include "uptodate.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" #include "suballoc.h" #include "resize.h" /* * Check whether there are new backup superblocks exist * in the last group. If there are some, mark them or clear * them in the bitmap. * * Return how many backups we find in the last group. */ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, u16 cl_cpg, u16 old_bg_clusters, int set) { int i; u16 backups = 0; u32 cluster, lgd_cluster; u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { blkno = ocfs2_backup_super_blkno(inode->i_sb, i); cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); gd_blkno = ocfs2_which_cluster_group(inode, cluster); if (gd_blkno < lgd_blkno) continue; else if (gd_blkno > lgd_blkno) break; /* check if already done backup super */ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno); lgd_cluster += old_bg_clusters; if (lgd_cluster >= cluster) continue; if (set) ocfs2_set_bit(cluster % cl_cpg, (unsigned long *)gd->bg_bitmap); else ocfs2_clear_bit(cluster % cl_cpg, (unsigned long *)gd->bg_bitmap); backups++; } return backups; } static int ocfs2_update_last_group_and_inode(handle_t *handle, struct inode *bm_inode, struct buffer_head *bm_bh, struct buffer_head *group_bh, u32 first_new_cluster, int new_clusters) { int ret = 0; struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb); struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct ocfs2_chain_rec *cr; struct ocfs2_group_desc *group; u16 chain, num_bits, backups = 0; u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); u16 old_bg_clusters; u16 contig_bits; __le16 old_bg_contig_free_bits; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), group_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } group = (struct ocfs2_group_desc *)group_bh->b_data; old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc; /* update the group first. */ num_bits = new_clusters * cl_bpc; le16_add_cpu(&group->bg_bits, num_bits); le16_add_cpu(&group->bg_free_bits_count, num_bits); /* * check whether there are some new backup superblocks exist in * this group and update the group bitmap accordingly. */ if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, cl_cpg, old_bg_clusters, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap, le16_to_cpu(group->bg_bits), 0); old_bg_contig_free_bits = group->bg_contig_free_bits; group->bg_contig_free_bits = cpu_to_le16(contig_bits); ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. */ ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_rollback; } chain = le16_to_cpu(group->bg_chain); cr = (&cl->cl_recs[chain]); le32_add_cpu(&cr->c_total, num_bits); le32_add_cpu(&cr->c_free, num_bits); le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits); le32_add_cpu(&fe->i_clusters, new_clusters); if (backups) { le32_add_cpu(&cr->c_free, -1 * backups); le32_add_cpu(&fe->id1.bitmap1.i_used, backups); } spin_lock(&OCFS2_I(bm_inode)->ip_lock); OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(bm_inode)->ip_lock); i_size_write(bm_inode, le64_to_cpu(fe->i_size)); ocfs2_journal_dirty(handle, bm_bh); out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, cl_cpg, old_bg_clusters, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); group->bg_contig_free_bits = old_bg_contig_free_bits; } out: if (ret) mlog_errno(ret); return ret; } static int update_backups(struct inode * inode, u32 clusters, char *data) { int i, ret = 0; u32 cluster; u64 blkno; struct buffer_head *backup = NULL; struct ocfs2_dinode *backup_di = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); /* calculate the real backups we need to update. */ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { blkno = ocfs2_backup_super_blkno(inode->i_sb, i); cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); if (cluster >= clusters) break; ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); if (ret < 0) { mlog_errno(ret); break; } memcpy(backup->b_data, data, inode->i_sb->s_blocksize); backup_di = (struct ocfs2_dinode *)backup->b_data; backup_di->i_blkno = cpu_to_le64(blkno); ret = ocfs2_write_super_or_backup(osb, backup); brelse(backup); backup = NULL; if (ret < 0) { mlog_errno(ret); break; } } return ret; } static void ocfs2_update_super_and_backups(struct inode *inode, int new_clusters) { int ret; u32 clusters = 0; struct buffer_head *super_bh = NULL; struct ocfs2_dinode *super_di = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); /* * update the superblock last. * It doesn't matter if the write failed. */ ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, &super_bh); if (ret < 0) { mlog_errno(ret); goto out; } super_di = (struct ocfs2_dinode *)super_bh->b_data; le32_add_cpu(&super_di->i_clusters, new_clusters); clusters = le32_to_cpu(super_di->i_clusters); ret = ocfs2_write_super_or_backup(osb, super_bh); if (ret < 0) { mlog_errno(ret); goto out; } if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB)) ret = update_backups(inode, clusters, super_bh->b_data); out: brelse(super_bh); if (ret) printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s" " during fs resize. This condition is not fatal," " but fsck.ocfs2 should be run to fix it\n", osb->dev_str); return; } /* * Extend the filesystem to the new number of clusters specified. This entry * point is only used to extend the current filesystem to the end of the last * existing group. */ int ocfs2_group_extend(struct inode * inode, int new_clusters) { int ret; handle_t *handle; struct buffer_head *main_bm_bh = NULL; struct buffer_head *group_bh = NULL; struct inode *main_bm_inode = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_group_desc *group = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 cl_bpc; u32 first_new_cluster; u64 lgd_blkno; if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; if (new_clusters < 0) return -EINVAL; else if (new_clusters == 0) return 0; main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); if (!main_bm_inode) { ret = -EINVAL; mlog_errno(ret); goto out; } inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { mlog_errno(ret); goto out_mutex; } fe = (struct ocfs2_dinode *)main_bm_bh->b_data; /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), * so any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != ocfs2_group_bitmap_size(osb->sb, 0, osb->s_feature_incompat) * 8) { mlog(ML_ERROR, "The disk is too old and small. " "Force to do offline resize."); ret = -EINVAL; goto out_unlock; } first_new_cluster = le32_to_cpu(fe->i_clusters); lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; } group = (struct ocfs2_group_desc *)group_bh->b_data; cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > le16_to_cpu(fe->id2.i_chain.cl_cpg)) { ret = -EINVAL; goto out_unlock; } trace_ocfs2_group_extend( (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); if (IS_ERR(handle)) { mlog_errno(PTR_ERR(handle)); ret = -EINVAL; goto out_unlock; } /* update the last group descriptor and inode. */ ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode, main_bm_bh, group_bh, first_new_cluster, new_clusters); if (ret) { mlog_errno(ret); goto out_commit; } ocfs2_update_super_and_backups(main_bm_inode, new_clusters); out_commit: ocfs2_commit_trans(osb, handle); out_unlock: brelse(group_bh); brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; } static int ocfs2_check_new_group(struct inode *inode, struct ocfs2_dinode *di, struct ocfs2_new_group_input *input, struct buffer_head *group_bh) { int ret; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)group_bh->b_data; u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); if (ret) goto out; ret = -EINVAL; if (le16_to_cpu(gd->bg_chain) != input->chain) mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " "while input has %u set.\n", (unsigned long long)le64_to_cpu(gd->bg_blkno), le16_to_cpu(gd->bg_chain), input->chain); else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc) mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " "input has %u clusters set\n", (unsigned long long)le64_to_cpu(gd->bg_blkno), le16_to_cpu(gd->bg_bits), input->clusters); else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc) mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u " "but it should have %u set\n", (unsigned long long)le64_to_cpu(gd->bg_blkno), le16_to_cpu(gd->bg_bits), input->frees * cl_bpc); else ret = 0; out: return ret; } static int ocfs2_verify_group_and_input(struct inode *inode, struct ocfs2_dinode *di, struct ocfs2_new_group_input *input, struct buffer_head *group_bh) { u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count); u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec); u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group); u32 total_clusters = le32_to_cpu(di->i_clusters); int ret = -EINVAL; if (cluster < total_clusters) mlog(ML_ERROR, "add a group which is in the current volume.\n"); else if (input->chain >= cl_count) mlog(ML_ERROR, "input chain exceeds the limit.\n"); else if (next_free != cl_count && next_free != input->chain) mlog(ML_ERROR, "the add group should be in chain %u\n", next_free); else if (total_clusters + input->clusters < total_clusters) mlog(ML_ERROR, "add group's clusters overflow.\n"); else if (input->clusters > cl_cpg) mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n"); else if (input->frees > input->clusters) mlog(ML_ERROR, "the free cluster exceeds the total clusters\n"); else if (total_clusters % cl_cpg != 0) mlog(ML_ERROR, "the last group isn't full. Use group extend first.\n"); else if (input->group != ocfs2_which_cluster_group(inode, cluster)) mlog(ML_ERROR, "group blkno is invalid\n"); else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh))) mlog(ML_ERROR, "group descriptor check failed.\n"); else ret = 0; return ret; } /* Add a new group descriptor to global_bitmap. */ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) { int ret; handle_t *handle; struct buffer_head *main_bm_bh = NULL; struct inode *main_bm_inode = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group = NULL; struct ocfs2_chain_list *cl; struct ocfs2_chain_rec *cr; u16 cl_bpc; u64 bg_ptr; if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); if (!main_bm_inode) { ret = -EINVAL; mlog_errno(ret); goto out; } inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { mlog_errno(ret); goto out_mutex; } fe = (struct ocfs2_dinode *)main_bm_bh->b_data; if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != ocfs2_group_bitmap_size(osb->sb, 0, osb->s_feature_incompat) * 8) { mlog(ML_ERROR, "The disk is too old and small." " Force to do offline resize."); ret = -EINVAL; goto out_unlock; } ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); if (ret < 0) { mlog(ML_ERROR, "Can't read the group descriptor # %llu " "from the device.", (unsigned long long)input->group); goto out_unlock; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh); ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); if (ret) { mlog_errno(ret); goto out_free_group_bh; } trace_ocfs2_group_add((unsigned long long)input->group, input->chain, input->clusters, input->frees); handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); if (IS_ERR(handle)) { mlog_errno(PTR_ERR(handle)); ret = -EINVAL; goto out_free_group_bh; } cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); cl = &fe->id2.i_chain; cr = &cl->cl_recs[input->chain]; ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode), group_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; } group = (struct ocfs2_group_desc *)group_bh->b_data; bg_ptr = le64_to_cpu(group->bg_next_group); group->bg_next_group = cr->c_blkno; ocfs2_journal_dirty(handle, group_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { group->bg_next_group = cpu_to_le64(bg_ptr); mlog_errno(ret); goto out_commit; } if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) { le16_add_cpu(&cl->cl_next_free_rec, 1); memset(cr, 0, sizeof(struct ocfs2_chain_rec)); } cr->c_blkno = cpu_to_le64(input->group); le32_add_cpu(&cr->c_total, input->clusters * cl_bpc); le32_add_cpu(&cr->c_free, input->frees * cl_bpc); le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc); le32_add_cpu(&fe->id1.bitmap1.i_used, (input->clusters - input->frees) * cl_bpc); le32_add_cpu(&fe->i_clusters, input->clusters); ocfs2_journal_dirty(handle, main_bm_bh); spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); ocfs2_update_super_and_backups(main_bm_inode, input->clusters); out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: if (ret < 0) ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; } |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/net_namespace.h> #include <linux/if_arp.h> #include <net/rtnetlink.h> static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev) { dev_lstats_add(dev, skb->len); dev_kfree_skb(skb); return NETDEV_TX_OK; } struct nlmon { struct netlink_tap nt; }; static int nlmon_open(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); nlmon->nt.dev = dev; nlmon->nt.module = THIS_MODULE; return netlink_add_tap(&nlmon->nt); } static int nlmon_close(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); return netlink_remove_tap(&nlmon->nt); } static void nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes); } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops nlmon_ethtool_ops = { .get_link = always_on, }; static const struct net_device_ops nlmon_ops = { .ndo_open = nlmon_open, .ndo_stop = nlmon_close, .ndo_start_xmit = nlmon_xmit, .ndo_get_stats64 = nlmon_get_stats64, }; static void nlmon_setup(struct net_device *dev) { dev->type = ARPHRD_NETLINK; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->netdev_ops = &nlmon_ops; dev->ethtool_ops = &nlmon_ethtool_ops; dev->needs_free_netdev = true; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->flags = IFF_NOARP; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; /* That's rather a softlimit here, which, of course, * can be altered. Not a real MTU, but what is to be * expected in most cases. */ dev->mtu = NLMSG_GOODSIZE; dev->min_mtu = sizeof(struct nlmsghdr); } static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) return -EINVAL; return 0; } static struct rtnl_link_ops nlmon_link_ops __read_mostly = { .kind = "nlmon", .priv_size = sizeof(struct nlmon), .setup = nlmon_setup, .validate = nlmon_validate, }; static __init int nlmon_register(void) { return rtnl_link_register(&nlmon_link_ops); } static __exit void nlmon_unregister(void) { rtnl_link_unregister(&nlmon_link_ops); } module_init(nlmon_register); module_exit(nlmon_unregister); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>"); MODULE_DESCRIPTION("Netlink monitoring device"); MODULE_ALIAS_RTNL_LINK("nlmon"); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if_arp.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/llc.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> #include <linux/export.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ static void netdev_port_receive(struct sk_buff *skb) { struct vport *vport; vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; if (skb->dev->type == ARPHRD_ETHER) skb_push_rcsum(skb, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: kfree_skb(skb); } /* Called with rcu_read_lock and bottom-halves disabled. */ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } static struct net_device *get_dpdev(const struct datapath *dp) { struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); return local->dev; } struct vport *ovs_netdev_link(struct vport *vport, const char *name) { int err; vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); if (!vport->dev) { err = -ENODEV; goto error_free_vport; } /* Ensure that the device exists and that the provided * name is not one of its aliases. */ if (strcmp(name, ovs_vport_name(vport))) { err = -ENODEV; goto error_put; } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) goto error_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; dev_disable_lro(vport->dev); dev_set_promiscuity(vport->dev, 1); vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); } static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); netdev_put(vport->dev, &vport->dev_tracker); ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and * underlying netdev deregistration; delete the link only * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) { return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); } |
| 26 36 11 30 44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * NILFS block mapping. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. */ #ifndef _NILFS_BMAP_H #define _NILFS_BMAP_H #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/nilfs2_ondisk.h> /* nilfs_binfo, nilfs_inode, etc */ #include "alloc.h" #include "dat.h" #define NILFS_BMAP_INVALID_PTR 0 #define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) struct nilfs_bmap; /** * union nilfs_bmap_ptr_req - request for bmap ptr * @bpr_ptr: bmap pointer * @bpr_req: request for persistent allocator */ union nilfs_bmap_ptr_req { __u64 bpr_ptr; struct nilfs_palloc_req bpr_req; }; /** * struct nilfs_bmap_stats - bmap statistics * @bs_nblocks: number of blocks created or deleted */ struct nilfs_bmap_stats { unsigned int bs_nblocks; }; /** * struct nilfs_bmap_operations - bmap operation table * @bop_lookup: single block search operation * @bop_lookup_contig: consecutive block search operation * @bop_insert: block insertion operation * @bop_delete: block delete operation * @bop_clear: block mapping resource release operation * @bop_propagate: operation to propagate dirty state towards the * mapping root * @bop_lookup_dirty_buffers: operation to collect dirty block buffers * @bop_assign: disk block address assignment operation * @bop_mark: operation to mark in-use blocks as dirty for * relocation by GC * @bop_seek_key: find valid block key operation * @bop_last_key: find last valid block key operation */ struct nilfs_bmap_operations { int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, unsigned int); int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *); void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, struct list_head *); int (*bop_assign)(struct nilfs_bmap *, struct buffer_head **, sector_t, union nilfs_binfo *); int (*bop_mark)(struct nilfs_bmap *, __u64, int); int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *); int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); /* private: internal use only */ int (*bop_check_insert)(const struct nilfs_bmap *, __u64); int (*bop_check_delete)(struct nilfs_bmap *, __u64); int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); }; #define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) #define NILFS_BMAP_KEY_BIT BITS_PER_LONG #define NILFS_BMAP_NEW_PTR_INIT (1UL << (BITS_PER_LONG - 1)) static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) { return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); } /** * struct nilfs_bmap - bmap structure * @b_u: raw data * @b_sem: semaphore * @b_inode: owner of bmap * @b_ops: bmap operation table * @b_last_allocated_key: last allocated key for data block * @b_last_allocated_ptr: last allocated ptr for data block * @b_ptr_type: pointer type * @b_state: state * @b_nchildren_per_block: maximum number of child nodes for non-root nodes */ struct nilfs_bmap { union { __u8 u_flags; __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; } b_u; struct rw_semaphore b_sem; struct inode *b_inode; const struct nilfs_bmap_operations *b_ops; __u64 b_last_allocated_key; __u64 b_last_allocated_ptr; int b_ptr_type; int b_state; __u16 b_nchildren_per_block; }; /* pointer type */ #define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */ #define NILFS_BMAP_PTR_VS 1 /* * virtual block number (single * version) */ #define NILFS_BMAP_PTR_VM 2 /* * virtual block number (has multiple * versions) */ #define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ #define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) /* state */ #define NILFS_BMAP_DIRTY 0x00000001 /** * struct nilfs_bmap_store - shadow copy of bmap state * @data: cached raw block mapping of on-disk inode * @last_allocated_key: cached value of last allocated key for data block * @last_allocated_ptr: cached value of last allocated ptr for data block * @state: cached value of state field of bmap structure */ struct nilfs_bmap_store { __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)]; __u64 last_allocated_key; __u64 last_allocated_ptr; int state; }; int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int); int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp); int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp); int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key); void nilfs_bmap_clear(struct nilfs_bmap *); int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, unsigned long, union nilfs_binfo *); int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); void nilfs_bmap_init_gc(struct nilfs_bmap *); void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *); void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *); static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, __u64 *ptr) { return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); } /* * Internal use only */ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, struct inode *dat) { if (dat) return nilfs_dat_prepare_alloc(dat, &req->bpr_req); /* ignore target ptr */ req->bpr_ptr = bmap->b_last_allocated_ptr++; return 0; } static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, struct inode *dat) { if (dat) nilfs_dat_commit_alloc(dat, &req->bpr_req); } static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, struct inode *dat) { if (dat) nilfs_dat_abort_alloc(dat, &req->bpr_req); else bmap->b_last_allocated_ptr--; } static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, struct inode *dat) { return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; } static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, struct inode *dat) { if (dat) nilfs_dat_commit_end(dat, &req->bpr_req, bmap->b_ptr_type == NILFS_BMAP_PTR_VS); } static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, struct inode *dat) { if (dat) nilfs_dat_abort_end(dat, &req->bpr_req); } static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { bmap->b_last_allocated_key = key; bmap->b_last_allocated_ptr = ptr; } __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); /* Assume that bmap semaphore is locked. */ static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) { return !!(bmap->b_state & NILFS_BMAP_DIRTY); } /* Assume that bmap semaphore is locked. */ static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) { bmap->b_state |= NILFS_BMAP_DIRTY; } /* Assume that bmap semaphore is locked. */ static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) { bmap->b_state &= ~NILFS_BMAP_DIRTY; } #define NILFS_BMAP_LARGE 0x1 #define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN #define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX #define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX #define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX #endif /* _NILFS_BMAP_H */ |
| 84 84 84 84 60 60 60 60 60 60 60 2 6 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 35 35 35 134 133 134 130 27 132 133 131 33 35 6 14 27 20 29 34 6 6 6 35 35 35 35 35 18 35 35 35 22 18 35 5 35 18 21 12 22 5 35 35 35 6 34 31 16 29 10 28 9 8 9 58 6 6 64 6 58 64 64 64 64 63 64 56 64 64 62 63 64 64 64 64 62 58 81 3 81 81 81 81 2 81 86 86 58 57 83 83 58 55 3 112 111 111 112 112 112 112 112 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 | // SPDX-License-Identifier: GPL-2.0 /* * Code for manipulating bucket marks for garbage collection. * * Copyright 2014 Datera, Inc. */ #include "bcachefs.h" #include "alloc_background.h" #include "backpointers.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" #include "disk_accounting.h" #include "ec.h" #include "error.h" #include "inode.h" #include "movinggc.h" #include "rebalance.h" #include "recovery.h" #include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "subvolume.h" #include "trace.h" #include <linux/preempt.h> void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { for (unsigned i = 0; i < BCH_DATA_NR; i++) usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); } void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) { memset(usage, 0, sizeof(*usage)); acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, sizeof(struct bch_dev_usage_full) / sizeof(u64)); } static u64 reserve_factor(u64 r) { return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); } static struct bch_fs_usage_short __bch2_fs_usage_read_short(struct bch_fs *c) { struct bch_fs_usage_short ret; u64 data, reserved; ret.capacity = c->capacity - percpu_u64_get(&c->usage->hidden); data = percpu_u64_get(&c->usage->data) + percpu_u64_get(&c->usage->btree); reserved = percpu_u64_get(&c->usage->reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); return ret; } struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *c) { struct bch_fs_usage_short ret; percpu_down_read(&c->mark_lock); ret = __bch2_fs_usage_read_short(c); percpu_up_read(&c->mark_lock); return ret; } void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev *ca, struct bch_dev_usage_full *usage) { if (out->nr_tabstops < 5) { printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); } prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); prt_printf(out, "\t%llu\r%llu\r%llu\r\n", usage->d[i].buckets, usage->d[i].sectors, usage->d[i].fragmented); } prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); } static int bch2_check_fix_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, bool *do_update) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (!ca) { if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, trans, ptr_to_invalid_device, "pointer to missing device %u\n" "while marking %s", p.ptr.dev, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; return 0; } struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); if (!g) { if (fsck_err(trans, ptr_to_invalid_device, "pointer to invalid bucket on device %u\n" "while marking %s", p.ptr.dev, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; goto out; } enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); if (fsck_err_on(!g->gen_valid, trans, ptr_to_missing_alloc_key, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; } else { /* this pointer will be dropped */ *do_update = true; goto out; } } /* g->gen_valid == true */ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { if (!p.ptr.cached && (g->data_type != BCH_DATA_btree || data_type == BCH_DATA_btree)) { g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } *do_update = true; } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, trans, stale_dirty_ptr, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) goto out; if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_type_str(g->data_type), bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { if (!p.ptr.cached && data_type == BCH_DATA_btree) { switch (g->data_type) { case BCH_DATA_sb: bch_err(c, "btree and superblock in the same bucket - cannot repair"); ret = bch_err_throw(c, fsck_repair_unimplemented); goto out; case BCH_DATA_journal: ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); bch_err_msg(c, ret, "error deleting journal bucket %zu", PTR_BUCKET_NR(ca, &p.ptr)); if (ret) goto out; break; } g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } else { *do_update = true; } } if (p.has_ec) { struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, trans, ptr_to_missing_stripe, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), trans, ptr_to_incorrect_stripe, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; } out: fsck_err: bch2_dev_put(ca); printbuf_exit(&buf); return ret; } int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c k, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry_c; struct extent_ptr_decoded p = { 0 }; bool do_update = false; struct printbuf buf = PRINTBUF; int ret = 0; /* We don't yet do btree key updates correctly for when we're RW */ BUG_ON(test_bit(BCH_FS_rw, &c->flags)); bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) goto err; } if (do_update) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; scoped_guard(rcu) bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); if (level) { /* * We don't want to drop btree node pointers - if the * btree node isn't there anymore, the read path will * sort it out: */ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); scoped_guard(rcu) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; } } else { struct bkey_ptrs ptrs; union bch_extent_entry *entry; rcu_read_lock(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); if ((p.ptr.cached && (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0) || gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || (g->data_type && g->data_type != data_type)) { bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); goto restart_drop_ptrs; } } rcu_read_unlock(); again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { struct gc_stripe *m = genradix_ptr(&c->gc_stripes, entry->stripe_ptr.idx); union bch_extent_entry *next_ptr; bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) goto found; next_ptr = NULL; found: if (!next_ptr) { bch_err(c, "aieee, found stripe ptr with no data ptr"); continue; } if (!m || !m->alive || !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], &next_ptr->ptr, m->sectors)) { bch2_bkey_extent_entry_drop(new, entry); goto again; } } } } if (0) { printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, k); bch_info(c, "updated %s", buf.buf); printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); bch_info(c, "new key %s", buf.buf); } if (!(flags & BTREE_TRIGGER_is_root)) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); bch2_trans_iter_exit(trans, &iter); if (ret) goto err; if (level) bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } else { struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(new->k.u64s)); ret = PTR_ERR_OR_ZERO(e); if (ret) goto err; journal_entry_set(e, BCH_JSET_ENTRY_btree_root, btree, level - 1, new, new->k.u64s); /* * no locking, we're single threaded and not rw yet, see * the big assertino above that we repeat here: */ BUG_ON(test_bit(BCH_FS_rw, &c->flags)); struct btree *b = bch2_btree_id_root(c, btree)->b; bkey_copy(&b->key, new); } } err: printbuf_exit(&buf); return ret; } static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, struct bkey_s_c k, bool insert, enum bch_sb_error_id id) { struct bch_fs *c = trans->c; prt_printf(buf, "\nwhile marking "); bch2_bkey_val_to_text(buf, c, k); prt_newline(buf); bool print = __bch2_count_fsck_err(c, id, buf); int ret = bch2_run_explicit_recovery_pass(c, buf, BCH_RECOVERY_PASS_check_allocations, 0); if (insert) { bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); /* * If we're in recovery, run_explicit_recovery_pass might give * us an error code for rewinding recovery */ if (!ret) ret = bch_err_throw(c, bucket_ref_update); } else { /* Always ignore overwrite errors, so that deletion works */ ret = 0; } if (print || insert) bch2_print_str(c, KERN_ERR, buf->buf); return ret; } int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 b_gen, u8 bucket_data_type, u32 *bucket_sectors) { struct bch_fs *c = trans->c; size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); struct printbuf buf = PRINTBUF; bool inserting = sectors > 0; int ret = 0; BUG_ON(!sectors); if (unlikely(gen_after(ptr->gen, b_gen))) { bch2_log_msg_start(c, &buf); prt_printf(&buf, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen); ret = bucket_ref_update_err(trans, &buf, k, inserting, BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); goto out; } if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { bch2_log_msg_start(c, &buf); prt_printf(&buf, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen); ret = bucket_ref_update_err(trans, &buf, k, inserting, BCH_FSCK_ERR_ptr_too_stale); goto out; } if (b_gen != ptr->gen && ptr->cached) { ret = 1; goto out; } if (unlikely(b_gen != ptr->gen)) { bch2_log_msg_start(c, &buf); prt_printf(&buf, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", ptr->dev, bucket_nr, b_gen, bucket_gen_get(ca, bucket_nr), bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen); ret = bucket_ref_update_err(trans, &buf, k, inserting, BCH_FSCK_ERR_stale_dirty_ptr); goto out; } if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { bch2_log_msg_start(c, &buf); prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type), bch2_data_type_str(ptr_data_type)); ret = bucket_ref_update_err(trans, &buf, k, inserting, BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); goto out; } if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { bch2_log_msg_start(c, &buf); prt_printf(&buf, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), *bucket_sectors, sectors); ret = bucket_ref_update_err(trans, &buf, k, inserting, BCH_FSCK_ERR_bucket_sector_count_overflow); sectors = -*bucket_sectors; goto out; } *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; } void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; static int warned_disk_usage = 0; bool warn = false; percpu_down_read(&c->mark_lock); struct bch_fs_usage_base *src = &trans->fs_usage_delta; s64 added = src->btree + src->data + src->reserved; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { u64 old, new; old = atomic64_read(&c->sectors_available); do { new = max_t(s64, 0, old - should_not_have_added); } while (!atomic64_try_cmpxchg(&c->sectors_available, &old, new)); added -= should_not_have_added; warn = true; } if (added > 0) { trans->disk_res->sectors -= added; this_cpu_sub(*c->online_reserved, added); } preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); preempt_enable(); percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) bch2_trans_inconsistent(trans, "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); } /* KEY_TYPE_extent: */ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, struct bch_alloc_v4 *a, bool insert) { u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : !p->ptr.cached ? &a->dirty_sectors : &a->cached_sectors; int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, a->gen, a->data_type, dst_sectors); if (ret) return ret; if (insert) alloc_data_type_set(a, ptr_data_type); return 0; } static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, s64 *sectors, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; bool insert = !(flags & BTREE_TRIGGER_overwrite); struct printbuf buf = PRINTBUF; int ret = 0; struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) ret = bch_err_throw(c, trigger_pointer); goto err; } struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if (!bucket_valid(ca, bucket.offset)) { if (insert) { bch2_dev_bucket_missing(ca, bucket.offset); ret = bch_err_throw(c, trigger_pointer); } goto err; } if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); if (ret) goto err; ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); if (ret) goto err; } if (flags & BTREE_TRIGGER_gc) { struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch_err_throw(c, trigger_pointer); goto err; } bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); alloc_to_bucket(g, new); bucket_unlock(g); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static int bch2_trigger_stripe_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, enum bch_data_type data_type, s64 sectors, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), BTREE_ITER_with_updates, stripe); int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); goto err; } if (!bch2_ptr_matches_stripe(&s->v, p)) { bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = bch_err_throw(c, trigger_stripe_pointer); goto err; } stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); struct disk_accounting_pos acc; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = data_type; ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); err: bch2_trans_iter_exit(trans, &iter); return ret; } if (flags & BTREE_TRIGGER_gc) { struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.ec.idx); return bch_err_throw(c, ENOMEM_mark_stripe_ptr); } gc_stripe_lock(m); if (!m || !m->alive) { gc_stripe_unlock(m); struct printbuf buf = PRINTBUF; bch2_log_msg_start(c, &buf); prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", (u64) p.ec.idx); bch2_bkey_val_to_text(&buf, c, k); __bch2_inconsistent_error(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return bch_err_throw(c, trigger_stripe_pointer); } m->block_sectors[p.ec.block] += sectors; struct disk_accounting_pos acc; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); gc_stripe_unlock(m); acc.replicas.data_type = data_type; int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true); if (ret) return ret; } return 0; } static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; int ret = 0; s64 replicas_sectors = 0; struct disk_accounting_pos acc_replicas_key; memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; acc_replicas_key.replicas.data_type = data_type; acc_replicas_key.replicas.nr_devs = 0; acc_replicas_key.replicas.nr_required = 1; unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = 0; ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; bool stale = ret > 0; if (p.ptr.cached && stale) continue; if (p.ptr.cached) { ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); if (ret) return ret; } else if (!p.has_ec) { replicas_sectors += disk_sectors; replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) return ret; /* * There may be other dirty pointers in this extent, but * if so they're not required for mounting if we have an * erasure coded pointer in this extent: */ acc_replicas_key.replicas.nr_required = 0; } if (cur_compression_type && cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, compression, cur_compression_type); if (ret) return ret; compression_acct[0] = 1; compression_acct[1] = 0; compression_acct[2] = 0; } cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; } } if (acc_replicas_key.replicas.nr_devs) { ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); if (ret) return ret; } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, compression, cur_compression_type); if (ret) return ret; } if (level) { ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), replicas_sectors, }; ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } return 0; } int bch2_trigger_extent(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; if (unlikely(flags & BTREE_TRIGGER_check_repair)) return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); /* if pointers aren't changing - nothing to do: */ if (new_ptrs_bytes == old_ptrs_bytes && !memcmp(new_ptrs.start, old_ptrs.start, new_ptrs_bytes)) return 0; if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { if (old.k->type) { int ret = __trigger_extent(trans, btree, level, old, flags & ~BTREE_TRIGGER_insert); if (ret) return ret; } if (new.k->type) { int ret = __trigger_extent(trans, btree, level, new.s_c, flags & ~BTREE_TRIGGER_overwrite); if (ret) return ret; } int need_rebalance_delta = 0; s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, new.k->p, need_rebalance_delta > 0); if (ret) return ret; } if (need_rebalance_sectors_delta[0]) { int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } } return 0; } /* KEY_TYPE_reservation */ static int __trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) sectors[0] = -sectors[0]; return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; } int bch2_trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } /* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, u64 b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); if (IS_ERR(a)) return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { struct printbuf buf = PRINTBUF; bch2_log_msg_start(c, &buf); prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s\n", iter.pos.inode, iter.pos.offset, a->v.gen, bch2_data_type_str(a->v.data_type), bch2_data_type_str(type), bch2_data_type_str(type)); bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); ret = bch2_run_explicit_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_allocations, 0); /* Always print, this is always fatal */ bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); if (!ret) ret = bch_err_throw(c, metadata_bucket_inconsistency); goto err; } if (a->v.data_type != type || a->v.dirty_sectors != sectors) { a->v.data_type = type; a->v.dirty_sectors = sectors; ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, u64 b, enum bch_data_type data_type, unsigned sectors, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; int ret = 0; struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", ca->dev_idx, bch2_data_type_str(data_type))) goto err; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); if (bch2_fs_inconsistent_on(g->data_type && g->data_type != data_type, c, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) goto err_unlock; if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) goto err_unlock; g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); bucket_unlock(g); ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); return ret; err_unlock: bucket_unlock(g); err: return bch_err_throw(c, metadata_bucket_inconsistency); } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, u64 b, enum bch_data_type type, unsigned sectors, enum btree_iter_update_trigger_flags flags) { BUG_ON(type != BCH_DATA_free && type != BCH_DATA_sb && type != BCH_DATA_journal); /* * Backup superblock might be past the end of our normal usable space: */ if (b >= ca->mi.nbuckets) return 0; if (flags & BTREE_TRIGGER_gc) return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); else if (flags & BTREE_TRIGGER_transactional) return commit_do(trans, NULL, NULL, 0, __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); else BUG(); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, struct bch_dev *ca, u64 start, u64 end, enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, enum btree_iter_update_trigger_flags flags) { do { u64 b = sector_to_bucket(ca, start); unsigned sectors = min_t(u64, bucket_to_sector(ca, b + 1), end) - start; if (b != *bucket && *bucket_sectors) { int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, type, *bucket_sectors, flags); if (ret) return ret; *bucket_sectors = 0; } *bucket = b; *bucket_sectors += sectors; start += sectors; } while (start < end); return 0; } static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; mutex_lock(&c->sb_lock); struct bch_sb_layout layout = ca->disk_sb.sb->layout; mutex_unlock(&c->sb_lock); u64 bucket = 0; unsigned i, bucket_sectors = 0; int ret; for (i = 0; i < layout.nr_superblocks; i++) { u64 offset = le64_to_cpu(layout.sb_offset[i]); if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, offset + (1 << layout.sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } if (bucket_sectors) { ret = bch2_trans_mark_metadata_bucket(trans, ca, bucket, BCH_DATA_sb, bucket_sectors, flags); if (ret) return ret; } for (i = 0; i < ca->journal.nr; i++) { ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], BCH_DATA_journal, ca->mi.bucket_size, flags); if (ret) return ret; } return 0; } int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca, flags)); bch_err_fn(c, ret); return ret; } int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, enum btree_iter_update_trigger_flags flags) { for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); return ret; } } return 0; } int bch2_trans_mark_dev_sbs(struct bch_fs *c) { return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); } bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; u64 b_offset = bucket_to_sector(ca, b); u64 b_end = bucket_to_sector(ca, b + 1); unsigned i; if (!b) return true; for (i = 0; i < layout->nr_superblocks; i++) { u64 offset = le64_to_cpu(layout->sb_offset[i]); u64 end = offset + (1 << layout->sb_max_size_bits); if (!(offset >= b_end || end <= b_offset)) return true; } for (i = 0; i < ca->journal.nr; i++) if (b == ca->journal.buckets[i]) return true; return false; } /* Disk reservations: */ #define SECTORS_CACHE 1024 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, enum bch_reservation_flags flags) { struct bch_fs_pcpu *pcpu; u64 old, get; u64 sectors_available; int ret; percpu_down_read(&c->mark_lock); preempt_disable(); pcpu = this_cpu_ptr(c->pcpu); if (sectors <= pcpu->sectors_available) goto out; old = atomic64_read(&c->sectors_available); do { get = min((u64) sectors + SECTORS_CACHE, old); if (get < sectors) { preempt_enable(); goto recalculate; } } while (!atomic64_try_cmpxchg(&c->sectors_available, &old, old - get)); pcpu->sectors_available += get; out: pcpu->sectors_available -= sectors; this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; preempt_enable(); percpu_up_read(&c->mark_lock); return 0; recalculate: mutex_lock(&c->sectors_available_lock); percpu_u64_set(&c->pcpu->sectors_available, 0); sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) sectors = min(sectors, sectors_available); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); ret = bch_err_throw(c, ENOSPC_disk_reservation); } mutex_unlock(&c->sectors_available_lock); percpu_up_read(&c->mark_lock); return ret; } /* Startup/shutdown: */ void bch2_buckets_nouse_free(struct bch_fs *c) { for_each_member_device(c, ca) { kvfree_rcu_mightsleep(ca->buckets_nouse); ca->buckets_nouse = NULL; } } int bch2_buckets_nouse_alloc(struct bch_fs *c) { for_each_member_device(c, ca) { BUG_ON(ca->buckets_nouse); ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { bch2_dev_put(ca); return bch_err_throw(c, ENOMEM_buckets_nouse); } } return 0; } static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; bool resize = ca->bucket_gens != NULL; int ret; if (resize) lockdep_assert_held(&c->state_lock); if (resize && ca->buckets_nouse) return bch_err_throw(c, no_resize_with_buckets_nouse); bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { ret = bch_err_throw(c, ENOMEM_bucket_gens); goto err; } bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; bucket_gens->nbuckets_minus_first = bucket_gens->nbuckets - bucket_gens->first_bucket; old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { u64 copy = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); memcpy(bucket_gens->b, old_bucket_gens->b, sizeof(bucket_gens->b[0]) * copy); } ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, ca->mi.nbuckets, nbuckets) ?: bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, ca->mi.nbuckets, nbuckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; nbuckets = ca->mi.nbuckets; ret = 0; err: if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); return ret; } void bch2_dev_buckets_free(struct bch_dev *ca) { kvfree(ca->buckets_nouse); kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); free_percpu(ca->usage); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) return bch_err_throw(c, ENOMEM_usage_init); return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } |
| 1 2 1 30 31 26 15 26 5 26 25 1 26 26 15 26 26 5 26 26 26 3 1 1 2 18 18 17 18 18 7 18 7 17 7 18 23 18 18 18 18 18 7 3 1 1 1 1 20 12 20 20 8 20 20 20 20 19 12 20 20 12 19 20 20 20 20 20 10 20 20 20 20 20 19 15 15 15 15 15 2 2 1 2 2 20 9 20 15 5 15 15 15 15 15 4 5 5 5 5 5 5 5 5 1 1 1 1 1 1 2 2 1 2 2 2 2 6 6 6 2 4 1 4 6 6 6 5 1 10 10 2 2 7 1 2 2 2 4 7 8 12 10 12 12 9 9 6 8 7 2 8 8 8 8 8 8 8 1 1 1 1 1 1 1 16 1 1 1 14 14 10 13 13 13 1 1 1 1 1 1 1 1 1 1 1 5 1 1 1 6 1 5 8 8 8 8 8 8 1 8 8 8 8 8 8 8 8 8 8 8 8 8 8 2 1 1 8 8 8 8 8 8 8 8 8 2 8 8 8 8 8 8 8 8 8 8 8 8 8 1 8 8 8 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 | // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> #include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> #include <linux/minmax.h> #include <linux/overflow.h> #include <linux/buildid.h> #include <asm/elf.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "internal.h" #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter_sum(mm, MM_ANONPAGES); file = get_mm_counter_sum(mm, MM_FILEPAGES); shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any * collector of these hiwater stats must therefore get total_vm * and rss too, which will usually be the higher. Barriers? not * worth the effort, such snapshots can always be inconsistent. */ hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; /* split executable areas between text and lib */ text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter_sum(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); SEQ_PUT_DEC(" kB\nRssFile:\t", file); SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); seq_put_decimal_ull_width(m, " kB\nVmExe:\t", text >> 10, 8); seq_put_decimal_ull_width(m, " kB\nVmLib:\t", lib >> 10, 8); seq_put_decimal_ull_width(m, " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } #undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { return PAGE_SIZE * mm->total_vm; } unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } #ifdef CONFIG_NUMA /* * Save get_task_policy() for show_numa_map(). */ static void hold_task_mempolicy(struct proc_maps_private *priv) { struct task_struct *task = priv->task; task_lock(task); priv->task_mempolicy = get_task_policy(task); mpol_get(priv->task_mempolicy); task_unlock(task); } static void release_task_mempolicy(struct proc_maps_private *priv) { mpol_put(priv->task_mempolicy); } #else static void hold_task_mempolicy(struct proc_maps_private *priv) { } static void release_task_mempolicy(struct proc_maps_private *priv) { } #endif static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, loff_t *ppos) { struct vm_area_struct *vma = vma_next(&priv->iter); if (vma) { *ppos = vma->vm_start; } else { *ppos = -2UL; vma = get_gate_vma(priv->mm); } return vma; } static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { put_task_struct(priv->task); priv->task = NULL; return NULL; } if (mmap_read_lock_killable(mm)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; return ERR_PTR(-EINTR); } vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); if (last_addr == -2UL) return get_gate_vma(mm); return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { if (*ppos == -2UL) { *ppos = -1UL; return NULL; } return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mm_struct *mm = priv->mm; if (!priv->task) return; release_task_mempolicy(priv); mmap_read_unlock(mm); mmput(mm); put_task_struct(priv->task); priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops, int psize) { struct proc_maps_private *priv = __seq_open_private(file, ops, psize); if (!priv) return -ENOMEM; priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR_OR_NULL(priv->mm)) { int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; } return 0; } static int proc_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; if (priv->mm) mmdrop(priv->mm); return seq_release_private(inode, file); } static int do_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { return proc_maps_open(inode, file, ops, sizeof(struct proc_maps_private)); } static void get_vma_name(struct vm_area_struct *vma, const struct path **path, const char **name, const char **name_fmt) { struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; *name = NULL; *path = NULL; *name_fmt = NULL; /* * Print the dentry name for named mappings, and a * special [heap] marker for the heap: */ if (vma->vm_file) { /* * If user named this anon shared memory via * prctl(PR_SET_VMA ..., use the provided name. */ if (anon_name) { *name_fmt = "[anon_shmem:%s]"; *name = anon_name->name; } else { *path = file_user_path(vma->vm_file); } return; } if (vma->vm_ops && vma->vm_ops->name) { *name = vma->vm_ops->name(vma); if (*name) return; } *name = arch_vma_name(vma); if (*name) return; if (!vma->vm_mm) { *name = "[vdso]"; return; } if (vma_is_initial_heap(vma)) { *name = "[heap]"; return; } if (vma_is_initial_stack(vma)) { *name = "[stack]"; return; } if (anon_name) { *name_fmt = "[anon:%s]"; *name = anon_name->name; return; } } static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_put_hex_ll(m, NULL, start, 8); seq_put_hex_ll(m, "-", end, 8); seq_putc(m, ' '); seq_putc(m, flags & VM_READ ? 'r' : '-'); seq_putc(m, flags & VM_WRITE ? 'w' : '-'); seq_putc(m, flags & VM_EXEC ? 'x' : '-'); seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); seq_put_hex_ll(m, " ", pgoff, 8); seq_put_hex_ll(m, " ", MAJOR(dev), 2); seq_put_hex_ll(m, ":", MINOR(dev), 2); seq_put_decimal_ull(m, " ", ino); seq_putc(m, ' '); } static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { const struct path *path; const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); get_vma_name(vma, &path, &name, &name_fmt); if (path) { seq_pad(m, ' '); seq_path(m, path, "\n"); } else if (name_fmt) { seq_pad(m, ' '); seq_printf(m, name_fmt, name); } else if (name) { seq_pad(m, ' '); seq_puts(m, name); } seq_putc(m, '\n'); } static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); return 0; } static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; static int pid_maps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_maps_op); } #define PROCMAP_QUERY_VMA_FLAGS ( \ PROCMAP_QUERY_VMA_READABLE | \ PROCMAP_QUERY_VMA_WRITABLE | \ PROCMAP_QUERY_VMA_EXECUTABLE | \ PROCMAP_QUERY_VMA_SHARED \ ) #define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ PROCMAP_QUERY_FILE_BACKED_VMA | \ PROCMAP_QUERY_VMA_FLAGS \ ) static int query_vma_setup(struct mm_struct *mm) { return mmap_read_lock_killable(mm); } static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma) { mmap_read_unlock(mm); } static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr) { return find_vma(mm, addr); } static struct vm_area_struct *query_matching_vma(struct mm_struct *mm, unsigned long addr, u32 flags) { struct vm_area_struct *vma; next_vma: vma = query_vma_find_by_addr(mm, addr); if (!vma) goto no_vma; /* user requested only file-backed VMA, keep iterating */ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) goto skip_vma; /* VMA permissions should satisfy query flags */ if (flags & PROCMAP_QUERY_VMA_FLAGS) { u32 perm = 0; if (flags & PROCMAP_QUERY_VMA_READABLE) perm |= VM_READ; if (flags & PROCMAP_QUERY_VMA_WRITABLE) perm |= VM_WRITE; if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) perm |= VM_EXEC; if (flags & PROCMAP_QUERY_VMA_SHARED) perm |= VM_MAYSHARE; if ((vma->vm_flags & perm) != perm) goto skip_vma; } /* found covering VMA or user is OK with the matching next VMA */ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) return vma; skip_vma: /* * If the user needs closest matching VMA, keep iterating. */ addr = vma->vm_end; if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) goto next_vma; no_vma: return ERR_PTR(-ENOENT); } static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) { struct procmap_query karg; struct vm_area_struct *vma; struct mm_struct *mm; const char *name = NULL; char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; __u64 usize; int err; if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) return -EFAULT; /* argument struct can never be that large, reject abuse */ if (usize > PAGE_SIZE) return -E2BIG; /* argument struct should have at least query_flags and query_addr fields */ if (usize < offsetofend(struct procmap_query, query_addr)) return -EINVAL; err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); if (err) return err; /* reject unknown flags */ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) return -EINVAL; /* either both buffer address and size are set, or both should be zero */ if (!!karg.vma_name_size != !!karg.vma_name_addr) return -EINVAL; if (!!karg.build_id_size != !!karg.build_id_addr) return -EINVAL; mm = priv->mm; if (!mm || !mmget_not_zero(mm)) return -ESRCH; err = query_vma_setup(mm); if (err) { mmput(mm); return err; } vma = query_matching_vma(mm, karg.query_addr, karg.query_flags); if (IS_ERR(vma)) { err = PTR_ERR(vma); vma = NULL; goto out; } karg.vma_start = vma->vm_start; karg.vma_end = vma->vm_end; karg.vma_flags = 0; if (vma->vm_flags & VM_READ) karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; if (vma->vm_flags & VM_WRITE) karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; if (vma->vm_flags & VM_EXEC) karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; if (vma->vm_flags & VM_MAYSHARE) karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; karg.vma_page_size = vma_kernel_pagesize(vma); if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; karg.dev_major = MAJOR(inode->i_sb->s_dev); karg.dev_minor = MINOR(inode->i_sb->s_dev); karg.inode = inode->i_ino; } else { karg.vma_offset = 0; karg.dev_major = 0; karg.dev_minor = 0; karg.inode = 0; } if (karg.build_id_size) { __u32 build_id_sz; err = build_id_parse(vma, build_id_buf, &build_id_sz); if (err) { karg.build_id_size = 0; } else { if (karg.build_id_size < build_id_sz) { err = -ENAMETOOLONG; goto out; } karg.build_id_size = build_id_sz; } } if (karg.vma_name_size) { size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); const struct path *path; const char *name_fmt; size_t name_sz = 0; get_vma_name(vma, &path, &name, &name_fmt); if (path || name_fmt || name) { name_buf = kmalloc(name_buf_sz, GFP_KERNEL); if (!name_buf) { err = -ENOMEM; goto out; } } if (path) { name = d_path(path, name_buf, name_buf_sz); if (IS_ERR(name)) { err = PTR_ERR(name); goto out; } name_sz = name_buf + name_buf_sz - name; } else if (name || name_fmt) { name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); name = name_buf; } if (name_sz > name_buf_sz) { err = -ENAMETOOLONG; goto out; } karg.vma_name_size = name_sz; } /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ query_vma_teardown(mm, vma); mmput(mm); if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), name, karg.vma_name_size)) { kfree(name_buf); return -EFAULT; } kfree(name_buf); if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), build_id_buf, karg.build_id_size)) return -EFAULT; if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) return -EFAULT; return 0; out: query_vma_teardown(mm, vma); mmput(mm); kfree(name_buf); return err; } static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; switch (cmd) { case PROCMAP_QUERY: return do_procmap_query(priv, (void __user *)arg); default: return -ENOIOCTLCMD; } } const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, .unlocked_ioctl = procfs_procmap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * Proportional Set Size(PSS): my share of RSS. * * PSS of a process is the count of pages it has in memory, where each * page is divided by the number of processes sharing it. So if a * process has 1000 pages all to itself, and 1000 shared with one other * process, its PSS will be 1500. * * To keep (accumulated) division errors low, we adopt a 64bit * fixed-point pss counter to minimize division errors. So (pss >> * PSS_SHIFT) would be the real byte count. * * A shift of 12 before division means (assuming 4K page size): * - 1M 3-user-pages add up to 8KB errors; * - supports mapcount up to 2^24, or 16M; * - supports PSS up to 2^52 bytes, or 4PB. */ #define PSS_SHIFT 12 #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; unsigned long ksm; u64 pss; u64 pss_anon; u64 pss_file; u64 pss_shmem; u64 pss_dirty; u64 pss_locked; u64 swap_pss; }; static void smaps_page_accumulate(struct mem_size_stats *mss, struct folio *folio, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { mss->pss += pss; if (folio_test_anon(folio)) mss->pss_anon += pss; else if (folio_test_swapbacked(folio)) mss->pss_shmem += pss; else mss->pss_file += pss; if (locked) mss->pss_locked += pss; if (dirty || folio_test_dirty(folio)) { mss->pss_dirty += pss; if (private) mss->private_dirty += size; else mss->shared_dirty += size; } else { if (private) mss->private_clean += size; else mss->shared_clean += size; } } static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool present) { struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; bool exclusive; int mapcount; /* * First accumulate quantities that depend only on |size| and the type * of the compound page. */ if (folio_test_anon(folio)) { mss->anonymous += size; if (!folio_test_swapbacked(folio) && !dirty && !folio_test_dirty(folio)) mss->lazyfree += size; } if (folio_test_ksm(folio)) mss->ksm += size; mss->resident += size; /* Accumulate the size in pages that have been accessed. */ if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * * refcount == 1 for present entries guarantees that the folio is mapped * exactly once. For large folios this implies that exactly one * PTE/PMD/... maps (a part of) this folio. * * Treat all non-present entries (where relying on the mapcount and * refcount doesn't make sense) as "maybe shared, but not sure how * often". We treat device private entries as being fake-present. * * Note that it would not be safe to read the mapcount especially for * pages referenced by migration entries, even with the PTL held. */ if (folio_ref_count(folio) == 1 || !present) { smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, dirty, locked, present); return; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { mapcount = folio_average_page_mapcount(folio); exclusive = !folio_maybe_mapped_shared(folio); } /* * We obtain a snapshot of the mapcount. Without holding the folio lock * this snapshot can be slightly wrong as we cannot always read the * mapcount atomically. */ for (i = 0; i < nr; i++, page++) { unsigned long pss = PAGE_SIZE << PSS_SHIFT; if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { mapcount = folio_precise_page_mapcount(folio, page); exclusive = mapcount < 2; } if (mapcount >= 2) pss /= mapcount; smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, dirty, locked, exclusive); } } #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, linear_page_index(vma, addr), linear_page_index(vma, end)); return 0; } #else #define smaps_pte_hole NULL #endif /* CONFIG_SHMEM */ static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) { #ifdef CONFIG_SHMEM if (walk->ops->pte_hole) { /* depth is not used */ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); } #endif } static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool present = false, young = false, dirty = false; pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { page = vm_normal_page(vma, addr, ptent); young = pte_young(ptent); dirty = pte_dirty(ptent); present = true; } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; mss->swap += PAGE_SIZE; mapcount = swp_swapcount(swpent); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; do_div(pss_delta, mapcount); mss->swap_pss += pss_delta; } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } } else if (is_pfn_swap_entry(swpent)) { if (is_device_private_entry(swpent)) present = true; page = pfn_swap_entry_to_page(swpent); } } else { smaps_pte_hole_lookup(addr, walk); return; } if (!page) return; smaps_account(mss, page, false, young, dirty, locked, present); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool present = false; struct folio *folio; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); present = true; } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; folio = page_folio(page); if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; else if (folio_is_zone_device(folio)) /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked, present); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { } #endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); out: cond_resched(); return 0; } static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. * * The length of the second argument of mnemonics[] * needs to be 3 instead of previously set 2 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) * to avoid spurious * -Werror=unterminated-string-initialization warning * with GCC 15 */ static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. */ [0 ... (BITS_PER_LONG-1)] = "??", [ilog2(VM_READ)] = "rd", [ilog2(VM_WRITE)] = "wr", [ilog2(VM_EXEC)] = "ex", [ilog2(VM_SHARED)] = "sh", [ilog2(VM_MAYREAD)] = "mr", [ilog2(VM_MAYWRITE)] = "mw", [ilog2(VM_MAYEXEC)] = "me", [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_ARM64_BTI [ilog2(VM_ARM64_BTI)] = "bt", #endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif [ilog2(VM_MIXEDMAP)] = "mm", [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", #ifdef CONFIG_ARM64_MTE [ilog2(VM_MTE)] = "mt", [ilog2(VM_MTE_ALLOWED)] = "", #endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", #if VM_PKEY_BIT3 [ilog2(VM_PKEY_BIT3)] = "", #endif #if VM_PKEY_BIT4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ #ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) [ilog2(VM_DROPPABLE)] = "dp", #endif #ifdef CONFIG_64BIT [ilog2(VM_SEALED)] = "sl", #endif }; size_t i; seq_puts(m, "VmFlags: "); for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } #ifdef CONFIG_HUGETLB_PAGE static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; pte_t ptent = huge_ptep_get(walk->mm, addr, pte); struct folio *folio = NULL; bool present = false; if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) folio = pfn_swap_entry_folio(swpent); } if (folio) { /* We treat non-present entries as "maybe shared". */ if (!present || folio_maybe_mapped_shared(folio) || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); } return 0; } #else #define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, .walk_lock = PGWALK_RDLOCK, }; /* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. * * Use vm_start of @vma as the beginning address if @start is 0. */ static void smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss, unsigned long start) { const struct mm_walk_ops *ops = &smaps_walk_ops; /* Invalid start */ if (start >= vma->vm_end) return; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all * swapped out pages belong to the shmem object, and we can * obtain the swap value much more efficiently. For private * writable mappings, we might have COW pages that are * not affected by the parent swapped out pages of the shmem * object, so we have to distinguish them during the page walk. * Unless we know that the shmem object (or the part mapped by * our VMA) has no swapped out pages at all. */ unsigned long shmem_swapped = shmem_swap_usage(vma); if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { ops = &smaps_shmem_walk_ops; } } /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); else walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) /* Show the contents common for smaps and smaps_rollup */ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, bool rollup_mode) { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); if (rollup_mode) { /* * These are meaningful only for smaps_rollup, otherwise two of * them are zero, and the other one is the same as Pss. */ SEQ_PUT_DEC(" kB\nPss_Anon: ", mss->pss_anon >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_File: ", mss->pss_file >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Shmem: ", mss->pss_shmem >> PSS_SHIFT); } SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); SEQ_PUT_DEC(" kB\nSwapPss: ", mss->swap_pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nLocked: ", mss->pss_locked >> PSS_SHIFT); seq_puts(m, " kB\n"); } static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss = {}; smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); seq_puts(m, " kB\n"); __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); return 0; } static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss = {}; struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; } ret = mmap_read_lock_killable(mm); if (ret) goto out_put_mm; hold_task_mempolicy(priv); vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; vma_start = vma->vm_start; do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; /* * Release mmap_lock temporarily if someone wants to * access it for write request. */ if (mmap_lock_is_contended(mm)) { vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { release_task_mempolicy(priv); goto out_put_mm; } /* * After dropping the lock, there are four cases to * consider. See the following example for explanation. * * +------+------+-----------+ * | VMA1 | VMA2 | VMA3 | * +------+------+-----------+ * | | | | * 4k 8k 16k 400k * * Suppose we drop the lock after reading VMA2 due to * contention, then we get: * * last_vma_end = 16k * * 1) VMA2 is freed, but VMA3 exists: * * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; continue; } /* Case 4 above */ if (vma->vm_end > last_vma_end) { smap_gather_stats(vma, &mss, last_vma_end); last_vma_end = vma->vm_end; } } } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); __show_smap(m, &mss, true); release_task_mempolicy(priv); mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(priv->task); priv->task = NULL; return ret; } #undef SEQ_PUT_DEC static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; static int pid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_smaps_op); } static int smaps_rollup_open(struct inode *inode, struct file *file) { int ret; struct proc_maps_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); if (!priv) return -ENOMEM; ret = single_open(file, show_smaps_rollup, priv); if (ret) goto out_free; priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR_OR_NULL(priv->mm)) { ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; single_release(inode, file); goto out_free; } return 0; out_free: kfree(priv); return ret; } static int smaps_rollup_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; if (priv->mm) mmdrop(priv->mm); kfree(priv); return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; const struct file_operations proc_pid_smaps_rollup_operations = { .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, .release = smaps_rollup_release, }; enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; struct clear_refs_private { enum clear_refs_types type; }; #ifdef CONFIG_MEM_SOFT_DIRTY static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; if (!pte_write(pte)) return false; if (!is_cow_mapping(vma->vm_flags)) return false; if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) return false; folio = vm_normal_folio(vma, addr, pte); if (!folio) return false; return folio_maybe_dma_pinned(folio); } static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) return; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } #else static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { } #endif static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; } if (!pmd_present(*pmd)) goto out; folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); folio_test_clear_young(folio); folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio) continue; /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); folio_test_clear_young(folio); folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; } static int clear_refs_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; if (vma->vm_flags & VM_PFNMAP) return 1; /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (cp->type == CLEAR_REFS_ANON && vma->vm_file) return 1; if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) return 1; return 0; } static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; type = (enum clear_refs_types)itype; if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; if (mmap_write_lock_killable(mm)) { count = -EINTR; goto out_mm; } if (type == CLEAR_REFS_MM_HIWATER_RSS) { /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); goto out_unlock; } if (type == CLEAR_REFS_SOFT_DIRTY) { for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); } out_unlock: mmap_write_unlock(mm); out_mm: mmput(mm); } put_task_struct(task); return count; } const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, .llseek = noop_llseek, }; typedef struct { u64 pme; } pagemap_entry_t; struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_UFFD_WP BIT_ULL(57) #define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; } static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) { if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) return folio_precise_page_mapcount(folio, page) == 1; return !folio_maybe_mapped_shared(folio); } static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; int err = 0; while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; if (vma) hole_end = min(end, vma->vm_start); else hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } if (!vma) break; /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } } out: return err; } static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame = 0, flags = 0; struct page *page = NULL; struct folio *folio; if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); if (pm->show_pfn) { pgoff_t offset; /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. */ if (is_pfn_swap_entry(entry)) offset = swp_offset_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); if (pte_marker_entry_uffd_wp(entry)) flags |= PM_UFFD_WP; if (is_guard_swp_entry(entry)) flags |= PM_GUARD_REGION; } if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; if ((flags & PM_PRESENT) && __folio_page_mapped_exclusively(folio, page)) flags |= PM_MMAP_EXCLUSIVE; } if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; struct page *page = NULL; struct folio *folio = NULL; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; if (pmd_present(pmd)) { page = pmd_page(pmd); flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned long offset; if (pm->show_pfn) { if (is_pfn_swap_entry(entry)) offset = swp_offset_pfn(entry) + idx; else offset = swp_offset(entry) + idx; frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); page = pfn_swap_entry_to_page(entry); } #endif if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; } for (; addr != end; addr += PAGE_SIZE, idx++) { u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && __folio_page_mapped_exclusively(folio, page)) cur_flags |= PM_MMAP_EXCLUSIVE; pme = make_pme(frame, cur_flags); err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { if (flags & PM_PRESENT) frame++; else if (flags & PM_SWAP) frame += (1 << MAX_SWAPFILES_SHIFT); } } spin_unlock(ptl); return err; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(&pme, pm); if (err) break; } pte_unmap_unlock(orig_pte, ptl); cond_resched(); return err; } #ifdef CONFIG_HUGETLB_PAGE /* This function walks within one hugetlb entry in the single call */ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); if (!folio_test_anon(folio)) flags |= PM_FILE; if (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) flags |= PM_UFFD_WP; flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); } else if (pte_swp_uffd_wp_any(pte)) { flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } cond_resched(); return err; } #else #define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * * For each page in the address space, this file contains one 64-bit entry * consisting of the following: * * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bit 57 pte is uffd-wp write-protected * Bit 58 pte is a guard region * Bits 59-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present * * If the page is not present but in swap, then the PFN contains an * encoding of the swap file number and the page's offset into the * swap. Unmapped pages return a null PFN. This allows determining * precisely which pages are mapped (or in swap) and comparing mapped * pages between processes. * * Efficient users of this interface will use /proc/pid/maps to * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; struct pagemapread pm; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; if (!mm || !mmget_not_zero(mm)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_mm; ret = 0; if (!count) goto out_mm; /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; end_vaddr = mm->task_size; /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); if (end >= start_vaddr && end < mm->task_size) end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; unsigned long end; pm.pos = 0; end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); mmap_read_unlock(mm); start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; } copied += len; buf += len; count -= len; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) ret = copied; out_free: kfree(pm.buffer); out_mm: mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR_OR_NULL(mm)) return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } static int pagemap_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { struct pm_scan_arg arg; unsigned long masks_of_interest, cur_vma_category; struct page_region *vec_buf; unsigned long vec_buf_len, vec_buf_index, found_pages; struct page_region __user *vec_out; }; static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long categories = 0; if (pte_present(pte)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page(vma, addr, pte); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; swp = pte_to_swp_entry(pte); if (is_guard_swp_entry(swp)) categories |= PAGE_IS_GUARD; else if ((p->masks_of_interest & PAGE_IS_FILE) && is_pfn_swap_entry(swp) && !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t ptent) { if (pte_present(ptent)) { pte_t old_pte; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_mkuffd_wp(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } else { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); } } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long categories = PAGE_IS_HUGE; if (pmd_present(pmd)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pmd_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page_pmd(vma, addr, pmd); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_huge_zero_pmd(pmd)) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (pmd_swp_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } return categories; } static void make_uffd_wp_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HUGETLB_PAGE static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for * swapped pages. */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { unsigned long psize; if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) return; psize = huge_page_size(hstate_vma(vma)); if (is_hugetlb_entry_migration(ptent)) set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); else if (!huge_pte_none(ptent)) huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, huge_pte_mkuffd_wp(ptent)); else set_huge_pte_at(vma->vm_mm, addr, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); } #endif /* CONFIG_HUGETLB_PAGE */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) static void pagemap_scan_backout_range(struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; if (cur_buf->start != addr) cur_buf->end = addr; else cur_buf->start = cur_buf->end = 0; p->found_pages -= (end - addr) / PAGE_SIZE; } #endif static bool pagemap_scan_is_interesting_page(unsigned long categories, const struct pagemap_scan_private *p) { categories ^= p->arg.category_inverted; if ((categories & p->arg.category_mask) != p->arg.category_mask) return false; if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) return false; return true; } static bool pagemap_scan_is_interesting_vma(unsigned long categories, const struct pagemap_scan_private *p) { unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; categories ^= p->arg.category_inverted; if ((categories & required) != required) return false; return true; } static int pagemap_scan_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long vma_category = 0; bool wp_allowed = userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma); if (!wp_allowed) { /* User requested explicit failure over wp-async capability */ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) return -EPERM; /* * User requires wr-protect, and allows silently skipping * unsupported vmas. */ if (p->arg.flags & PM_SCAN_WP_MATCHING) return 1; /* * Then the request doesn't involve wr-protects at all, * fall through to the rest checks, and allow vma walk. */ } if (vma->vm_flags & VM_PFNMAP) return 1; if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; if (vma->vm_flags & VM_SOFTDIRTY) vma_category |= PAGE_IS_SOFT_DIRTY; if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; p->cur_vma_category = vma_category; return 0; } static bool pagemap_scan_push_range(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; /* * When there is no output buffer provided at all, the sentinel values * won't match here. There is no other way for `cur_buf->end` to be * non-zero other than it being non-empty. */ if (addr == cur_buf->end && categories == cur_buf->categories) { cur_buf->end = end; return true; } if (cur_buf->end) { if (p->vec_buf_index >= p->vec_buf_len - 1) return false; cur_buf = &p->vec_buf[++p->vec_buf_index]; } cur_buf->start = addr; cur_buf->end = end; cur_buf->categories = categories; return true; } static int pagemap_scan_output(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long *end) { unsigned long n_pages, total_pages; int ret = 0; if (!p->vec_buf) return 0; categories &= p->arg.return_mask; n_pages = (*end - addr) / PAGE_SIZE; if (check_add_overflow(p->found_pages, n_pages, &total_pages) || total_pages > p->arg.max_pages) { size_t n_too_much = total_pages - p->arg.max_pages; *end -= n_too_much * PAGE_SIZE; n_pages -= n_too_much; ret = -ENOSPC; } if (!pagemap_scan_push_range(categories, p, addr, *end)) { *end = addr; n_pages = 0; ret = -ENOSPC; } p->found_pages += n_pages; if (ret) p->arg.walk_end = *end; return ret; } static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return -ENOENT; categories = p->cur_vma_category | pagemap_thp_category(p, vma, start, *pmd); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~p->arg.flags & PM_SCAN_WP_MATCHING) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; /* * Break huge page into small pages if the WP operation * needs to be performed on a portion of the huge page. */ if (end != start + HPAGE_SIZE) { spin_unlock(ptl); split_huge_pmd(vma, pmd, start); pagemap_scan_backout_range(p, start, end); /* Report as if there was no THP */ return -ENOENT; } make_uffd_wp_pmd(vma, start, pmd); flush_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); return ret; #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ return -ENOENT; #endif } static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long addr, flush_end = 0; pte_t *pte, *start_pte; spinlock_t *ptl; int ret; ret = pagemap_scan_thp_entry(pmd, start, end, walk); if (ret != -ENOENT) return ret; ret = 0; start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } arch_enter_lazy_mmu_mode(); if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = addr + PAGE_SIZE; } goto flush_and_return; } if (!p->arg.category_anyof_mask && !p->arg.category_inverted && p->arg.category_mask == PAGE_IS_WRITTEN && p->arg.return_mask == PAGE_IS_WRITTEN) { for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { unsigned long next = addr + PAGE_SIZE; pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } goto flush_and_return; } for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); unsigned long categories = p->cur_vma_category | pagemap_page_category(p, vma, addr, ptent); unsigned long next = addr + PAGE_SIZE; if (!pagemap_scan_is_interesting_page(categories, p)) continue; ret = pagemap_scan_output(categories, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; if (~categories & PAGE_IS_WRITTEN) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); return ret; } #ifdef CONFIG_HUGETLB_PAGE static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; pte_t pte; if (~p->arg.flags & PM_SCAN_WP_MATCHING) { /* Go the short route when not write-protecting pages. */ pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) return 0; return pagemap_scan_output(categories, p, start, &end); } i_mmap_lock_write(vma->vm_file->f_mapping); ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; if (end != start + HPAGE_SIZE) { /* Partial HugeTLB page WP isn't possible. */ pagemap_scan_backout_range(p, start, end); p->arg.walk_end = start; ret = 0; goto out_unlock; } make_uffd_wp_huge_pte(vma, start, ptep, pte); flush_hugetlb_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); i_mmap_unlock_write(vma->vm_file->f_mapping); return ret; } #else #define pagemap_scan_hugetlb_entry NULL #endif static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; int ret, err; if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) return 0; ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); if (addr == end) return ret; if (~p->arg.flags & PM_SCAN_WP_MATCHING) return ret; err = uffd_wp_range(vma, addr, end - addr, true); if (err < 0) ret = err; return ret; } static const struct mm_walk_ops pagemap_scan_ops = { .test_walk = pagemap_scan_test_walk, .pmd_entry = pagemap_scan_pmd_entry, .pte_hole = pagemap_scan_pte_hole, .hugetlb_entry = pagemap_scan_hugetlb_entry, }; static int pagemap_scan_get_args(struct pm_scan_arg *arg, unsigned long uarg) { if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) return -EFAULT; if (arg->size != sizeof(struct pm_scan_arg)) return -EINVAL; /* Validate requested features */ if (arg->flags & ~PM_SCAN_FLAGS) return -EINVAL; if ((arg->category_inverted | arg->category_mask | arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) return -EINVAL; arg->start = untagged_addr((unsigned long)arg->start); arg->end = untagged_addr((unsigned long)arg->end); arg->vec = untagged_addr((unsigned long)arg->vec); /* Validate memory pointers */ if (!IS_ALIGNED(arg->start, PAGE_SIZE)) return -EINVAL; if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ arg->end = ALIGN(arg->end, PAGE_SIZE); arg->walk_end = 0; if (!arg->max_pages) arg->max_pages = ULONG_MAX; return 0; } static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, unsigned long uargl) { struct pm_scan_arg __user *uarg = (void __user *)uargl; if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) return -EFAULT; return 0; } static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) { if (!p->arg.vec_len) return 0; p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, p->arg.vec_len); p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), GFP_KERNEL); if (!p->vec_buf) return -ENOMEM; p->vec_buf->start = p->vec_buf->end = 0; p->vec_out = (struct page_region __user *)(long)p->arg.vec; return 0; } static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) { const struct page_region *buf = p->vec_buf; long n = p->vec_buf_index; if (!p->vec_buf) return 0; if (buf[n].end != buf[n].start) n++; if (!n) return 0; if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) return -EFAULT; p->arg.vec_len -= n; p->vec_out += n; p->vec_buf_index = 0; p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); p->vec_buf->start = p->vec_buf->end = 0; return n; } static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) { struct pagemap_scan_private p = {0}; unsigned long walk_start; size_t n_ranges_out = 0; int ret; ret = pagemap_scan_get_args(&p.arg, uarg); if (ret) return ret; p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | p.arg.return_mask; ret = pagemap_scan_init_bounce_buffer(&p); if (ret) return ret; for (walk_start = p.arg.start; walk_start < p.arg.end; walk_start = p.arg.walk_end) { struct mmu_notifier_range range; long n_out; if (fatal_signal_pending(current)) { ret = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) break; /* Protection change for the range is going to happen. */ if (p.arg.flags & PM_SCAN_WP_MATCHING) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, walk_start, p.arg.end); mmu_notifier_invalidate_range_start(&range); } ret = walk_page_range(mm, walk_start, p.arg.end, &pagemap_scan_ops, &p); if (p.arg.flags & PM_SCAN_WP_MATCHING) mmu_notifier_invalidate_range_end(&range); mmap_read_unlock(mm); n_out = pagemap_scan_flush_buffer(&p); if (n_out < 0) ret = n_out; else n_ranges_out += n_out; if (ret != -ENOSPC) break; if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) break; } /* ENOSPC signifies early stop (buffer full) from the walk. */ if (!ret || ret == -ENOSPC) ret = n_ranges_out; /* The walk_end isn't set when ret is zero */ if (!p.arg.walk_end) p.arg.walk_end = p.arg.end; if (pagemap_scan_writeback_args(&p.arg, uarg)) ret = -EFAULT; kfree(p.vec_buf); return ret; } static long do_pagemap_cmd(struct file *file, unsigned int cmd, unsigned long arg) { struct mm_struct *mm = file->private_data; switch (cmd) { case PAGEMAP_SCAN: return do_pagemap_scan(mm, arg); default: return -EINVAL; } } const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA struct numa_maps { unsigned long pages; unsigned long anon; unsigned long active; unsigned long writeback; unsigned long mapcount_max; unsigned long dirty; unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; struct numa_maps_private { struct proc_maps_private proc_maps; struct numa_maps md; }; static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { struct folio *folio = page_folio(page); int count; if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) count = folio_precise_page_mapcount(folio, page); else count = folio_average_page_mapcount(folio); md->pages += nr_pages; if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; if (folio_test_swapcache(folio)) md->swapcache += nr_pages; if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; if (folio_test_writeback(folio)) md->writeback += nr_pages; if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pte_present(pte)) return NULL; page = vm_normal_page(vma, addr, pte); if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static struct page *can_gather_numa_stats_pmd(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pmd_present(pmd)) return NULL; page = vm_normal_page_pmd(vma, addr, pmd); if (!page) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #endif static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct numa_maps *md = walk->private; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte; pte_t *pte; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { struct page *page; page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; } #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } do { pte_t ptent = ptep_get(pte); struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte); struct numa_maps *md; struct page *page; if (!pte_present(huge_pte)) return 0; page = pte_page(huge_pte); md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); return 0; } #else static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, .walk_lock = PGWALK_RDLOCK, }; /* * Display pages allocated per node and memory policy via /proc. */ static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; char buffer[64]; struct mempolicy *pol; pgoff_t ilx; int nid; if (!mm) return 0; /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); } else { mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); } seq_printf(m, "%08lx %s", vma->vm_start, buffer); if (file) { seq_puts(m, " file="); seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); /* mmap_lock is held by m_start */ walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) goto out; if (md->anon) seq_printf(m, " anon=%lu", md->anon); if (md->dirty) seq_printf(m, " dirty=%lu", md->dirty); if (md->pages != md->anon && md->pages != md->dirty) seq_printf(m, " mapped=%lu", md->pages); if (md->mapcount_max > 1) seq_printf(m, " mapmax=%lu", md->mapcount_max); if (md->swapcache) seq_printf(m, " swapcache=%lu", md->swapcache); if (md->active < md->pages && !is_vm_hugetlb_page(vma)) seq_printf(m, " active=%lu", md->active); if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); for_each_node_state(nid, N_MEMORY) if (md->node[nid]) seq_printf(m, " N%d=%lu", nid, md->node[nid]); seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); return 0; } static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map, }; static int pid_numa_maps_open(struct inode *inode, struct file *file) { return proc_maps_open(inode, file, &proc_pid_numa_maps_op, sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; #endif /* CONFIG_NUMA */ |
| 11 39 38 30 21 30 36 7 39 119 28 16 38 10 120 102 39 39 39 28 1 39 81 17 80 81 213 213 12 194 129 40 5 9 10 30 129 10 4 4 203 203 18 192 214 180 129 35 35 517 214 392 389 214 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fat/cache.c * * Written 1992,1993 by Werner Almesberger * * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead * of inode number. * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. */ #include <linux/slab.h> #include "fat.h" /* this must be > 0. */ #define FAT_MAX_CACHE 8 struct fat_cache { struct list_head cache_list; int nr_contig; /* number of contiguous clusters */ int fcluster; /* cluster number in the file. */ int dcluster; /* cluster number on disk. */ }; struct fat_cache_id { unsigned int id; int nr_contig; int fcluster; int dcluster; }; static inline int fat_max_cache(struct inode *inode) { return FAT_MAX_CACHE; } static struct kmem_cache *fat_cache_cachep; static void init_once(void *foo) { struct fat_cache *cache = (struct fat_cache *)foo; INIT_LIST_HEAD(&cache->cache_list); } int __init fat_cache_init(void) { fat_cache_cachep = kmem_cache_create("fat_cache", sizeof(struct fat_cache), 0, SLAB_RECLAIM_ACCOUNT, init_once); if (fat_cache_cachep == NULL) return -ENOMEM; return 0; } void fat_cache_destroy(void) { kmem_cache_destroy(fat_cache_cachep); } static inline struct fat_cache *fat_cache_alloc(struct inode *inode) { return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); } static inline void fat_cache_free(struct fat_cache *cache) { BUG_ON(!list_empty(&cache->cache_list)); kmem_cache_free(fat_cache_cachep, cache); } static inline void fat_cache_update_lru(struct inode *inode, struct fat_cache *cache) { if (MSDOS_I(inode)->cache_lru.next != &cache->cache_list) list_move(&cache->cache_list, &MSDOS_I(inode)->cache_lru); } static int fat_cache_lookup(struct inode *inode, int fclus, struct fat_cache_id *cid, int *cached_fclus, int *cached_dclus) { static struct fat_cache nohit = { .fcluster = 0, }; struct fat_cache *hit = &nohit, *p; int offset = -1; spin_lock(&MSDOS_I(inode)->cache_lru_lock); list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { /* Find the cache of "fclus" or nearest cache. */ if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { hit = p; if ((hit->fcluster + hit->nr_contig) < fclus) { offset = hit->nr_contig; } else { offset = fclus - hit->fcluster; break; } } } if (hit != &nohit) { fat_cache_update_lru(inode, hit); cid->id = MSDOS_I(inode)->cache_valid_id; cid->nr_contig = hit->nr_contig; cid->fcluster = hit->fcluster; cid->dcluster = hit->dcluster; *cached_fclus = cid->fcluster + offset; *cached_dclus = cid->dcluster + offset; } spin_unlock(&MSDOS_I(inode)->cache_lru_lock); return offset; } static struct fat_cache *fat_cache_merge(struct inode *inode, struct fat_cache_id *new) { struct fat_cache *p; list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { /* Find the same part as "new" in cluster-chain. */ if (p->fcluster == new->fcluster) { BUG_ON(p->dcluster != new->dcluster); if (new->nr_contig > p->nr_contig) p->nr_contig = new->nr_contig; return p; } } return NULL; } static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) { struct fat_cache *cache, *tmp; if (new->fcluster == -1) /* dummy cache */ return; spin_lock(&MSDOS_I(inode)->cache_lru_lock); if (new->id != FAT_CACHE_VALID && new->id != MSDOS_I(inode)->cache_valid_id) goto out; /* this cache was invalidated */ cache = fat_cache_merge(inode, new); if (cache == NULL) { if (MSDOS_I(inode)->nr_caches < fat_max_cache(inode)) { MSDOS_I(inode)->nr_caches++; spin_unlock(&MSDOS_I(inode)->cache_lru_lock); tmp = fat_cache_alloc(inode); if (!tmp) { spin_lock(&MSDOS_I(inode)->cache_lru_lock); MSDOS_I(inode)->nr_caches--; spin_unlock(&MSDOS_I(inode)->cache_lru_lock); return; } spin_lock(&MSDOS_I(inode)->cache_lru_lock); cache = fat_cache_merge(inode, new); if (cache != NULL) { MSDOS_I(inode)->nr_caches--; fat_cache_free(tmp); goto out_update_lru; } cache = tmp; } else { struct list_head *p = MSDOS_I(inode)->cache_lru.prev; cache = list_entry(p, struct fat_cache, cache_list); } cache->fcluster = new->fcluster; cache->dcluster = new->dcluster; cache->nr_contig = new->nr_contig; } out_update_lru: fat_cache_update_lru(inode, cache); out: spin_unlock(&MSDOS_I(inode)->cache_lru_lock); } /* * Cache invalidation occurs rarely, thus the LRU chain is not updated. It * fixes itself after a while. */ static void __fat_cache_inval_inode(struct inode *inode) { struct msdos_inode_info *i = MSDOS_I(inode); struct fat_cache *cache; while (!list_empty(&i->cache_lru)) { cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list); list_del_init(&cache->cache_list); i->nr_caches--; fat_cache_free(cache); } /* Update. The copy of caches before this id is discarded. */ i->cache_valid_id++; if (i->cache_valid_id == FAT_CACHE_VALID) i->cache_valid_id++; } void fat_cache_inval_inode(struct inode *inode) { spin_lock(&MSDOS_I(inode)->cache_lru_lock); __fat_cache_inval_inode(inode); spin_unlock(&MSDOS_I(inode)->cache_lru_lock); } static inline int cache_contiguous(struct fat_cache_id *cid, int dclus) { cid->nr_contig++; return ((cid->dcluster + cid->nr_contig) == dclus); } static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) { cid->id = FAT_CACHE_VALID; cid->fcluster = fclus; cid->dcluster = dclus; cid->nr_contig = 0; } int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const int limit = sb->s_maxbytes >> sbi->cluster_bits; struct fat_entry fatent; struct fat_cache_id cid; int nr; BUG_ON(MSDOS_I(inode)->i_start == 0); *fclus = 0; *dclus = MSDOS_I(inode)->i_start; if (!fat_valid_entry(sbi, *dclus)) { fat_fs_error_ratelimit(sb, "%s: invalid start cluster (i_pos %lld, start %08x)", __func__, MSDOS_I(inode)->i_pos, *dclus); return -EIO; } if (cluster == 0) return 0; if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0) { /* * dummy, always not contiguous * This is reinitialized by cache_init(), later. */ cache_init(&cid, -1, -1); } fatent_init(&fatent); while (*fclus < cluster) { /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { fat_fs_error_ratelimit(sb, "%s: detected the cluster chain loop (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } nr = fat_ent_read(inode, &fatent, *dclus); if (nr < 0) goto out; else if (nr == FAT_ENT_FREE) { fat_fs_error_ratelimit(sb, "%s: invalid cluster chain (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { fat_cache_add(inode, &cid); goto out; } (*fclus)++; *dclus = nr; if (!cache_contiguous(&cid, *dclus)) cache_init(&cid, *fclus, *dclus); } nr = 0; fat_cache_add(inode, &cid); out: fatent_brelse(&fatent); return nr; } static int fat_bmap_cluster(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; int ret, fclus, dclus; if (MSDOS_I(inode)->i_start == 0) return 0; ret = fat_get_cluster(inode, cluster, &fclus, &dclus); if (ret < 0) return ret; else if (ret == FAT_ENT_EOF) { fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); return -EIO; } return dclus; } int fat_get_mapped_cluster(struct inode *inode, sector_t sector, sector_t last_block, unsigned long *mapped_blocks, sector_t *bmap) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); int cluster, offset; cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); offset = sector & (sbi->sec_per_clus - 1); cluster = fat_bmap_cluster(inode, cluster); if (cluster < 0) return cluster; else if (cluster) { *bmap = fat_clus_to_blknr(sbi, cluster) + offset; *mapped_blocks = sbi->sec_per_clus - offset; if (*mapped_blocks > last_block - sector) *mapped_blocks = last_block - sector; } return 0; } static int is_exceed_eof(struct inode *inode, sector_t sector, sector_t *last_block, int create) { struct super_block *sb = inode->i_sb; const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; if (sector >= *last_block) { if (!create) return 1; /* * ->mmu_private can access on only allocation path. * (caller must hold ->i_mutex) */ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) >> blocksize_bits; if (sector >= *last_block) return 1; } return 0; } int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create, bool from_bmap) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); sector_t last_block; *phys = 0; *mapped_blocks = 0; if (!is_fat32(sbi) && (inode->i_ino == MSDOS_ROOT_INO)) { if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { *phys = sector + sbi->dir_start; *mapped_blocks = 1; } return 0; } if (!from_bmap) { if (is_exceed_eof(inode, sector, &last_block, create)) return 0; } else { last_block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); if (sector >= last_block) return 0; } return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, phys); } |
| 6 3 1 3 6 6 6 3 7 1 6 1 1 1 35 35 1 1 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * export.c */ /* * This file implements code to make Squashfs filesystems exportable (NFS etc.) * * The export code uses an inode lookup table to map inode numbers passed in * filehandles to an inode location on disk. This table is stored compressed * into metadata blocks. A second index table is used to locate these. This * second index table for speed of access (and because it is small) is read at * mount time and cached in memory. * * The inode lookup table is used only by the export code, inode disk * locations are directly encoded in directories, enabling direct access * without an intermediate lookup for all operations except the export ops. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" /* * Look-up inode number (ino) in table, returning the inode location. */ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) { struct squashfs_sb_info *msblk = sb->s_fs_info; int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); u64 start; __le64 ino; int err; TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); if (ino_num == 0 || (ino_num - 1) >= msblk->inodes) return -EINVAL; start = le64_to_cpu(msblk->inode_lookup_table[blk]); err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); if (err < 0) return err; TRACE("squashfs_inode_lookup, inode = 0x%llx\n", (u64) le64_to_cpu(ino)); return le64_to_cpu(ino); } static struct dentry *squashfs_export_iget(struct super_block *sb, unsigned int ino_num) { long long ino; struct dentry *dentry = ERR_PTR(-ENOENT); TRACE("Entered squashfs_export_iget\n"); ino = squashfs_inode_lookup(sb, ino_num); if (ino >= 0) dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); return dentry; } static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) || fh_len < 2) return NULL; return squashfs_export_iget(sb, fid->i32.ino); } static struct dentry *squashfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) return NULL; return squashfs_export_iget(sb, fid->i32.parent_ino); } static struct dentry *squashfs_get_parent(struct dentry *child) { struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); } /* * Read uncompressed inode lookup table indexes off disk into memory */ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes); int n; __le64 *table; u64 start, end; TRACE("In read_inode_lookup_table, length %d\n", length); /* Sanity check values */ /* there should always be at least one inode */ if (inodes == 0) return ERR_PTR(-EINVAL); /* * The computed size of the lookup table (length bytes) should exactly * match the table start and end points */ if (length != (next_table - lookup_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, lookup_table_start, length); if (IS_ERR(table)) return table; /* * table0], table[1], ... table[indexes - 1] store the locations * of the compressed inode lookup blocks. Each entry should be * less than the next (i.e. table[0] < table[1]), and the difference * between them should be SQUASHFS_METADATA_SIZE or less. * table[indexes - 1] should be less than lookup_table_start, and * again the difference should be SQUASHFS_METADATA_SIZE or less */ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= lookup_table_start || (lookup_table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } return table; } const struct export_operations squashfs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = squashfs_fh_to_dentry, .fh_to_parent = squashfs_fh_to_parent, .get_parent = squashfs_get_parent }; |
| 226 112 112 112 65 92 72 164 163 17 17 40 40 40 47 48 45 47 226 224 225 224 16 16 16 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 | // SPDX-License-Identifier: GPL-2.0 /* * Asynchronous refcounty things * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include <linux/closure.h> #include <linux/debugfs.h> #include <linux/export.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/sched/debug.h> static inline void closure_put_after_sub_checks(int flags) { int r = flags & CLOSURE_REMAINING_MASK; if (WARN(flags & CLOSURE_GUARD_MASK, "closure has guard bits set: %x (%u)", flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) r &= ~CLOSURE_GUARD_MASK; WARN(!r && (flags & ~CLOSURE_DESTRUCTOR), "closure ref hit 0 with incorrect flags set: %x (%u)", flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); } static inline void closure_put_after_sub(struct closure *cl, int flags) { closure_put_after_sub_checks(flags); if (!(flags & CLOSURE_REMAINING_MASK)) { smp_acquire__after_ctrl_dep(); cl->closure_get_happened = false; if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); closure_queue(cl); } else { struct closure *parent = cl->parent; closure_fn *destructor = cl->fn; closure_debug_destroy(cl); if (destructor) destructor(&cl->work); if (parent) closure_put(parent); } } } /* For clearing flags with the same atomic op as a put */ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining)); } EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount */ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining)); } EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier */ void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; struct closure *cl, *t; struct llist_node *reverse = NULL; list = llist_del_all(&wait_list->list); /* We first reverse the list to preserve FIFO ordering and fairness */ reverse = llist_reverse_order(list); /* Then do the wakeups */ llist_for_each_entry_safe(cl, t, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } } EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist * @waitlist: will own a ref on @cl, which will be released when * closure_wake_up() is called on @waitlist. * @cl: closure pointer. * */ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) { if (atomic_read(&cl->remaining) & CLOSURE_WAITING) return false; cl->closure_get_happened = true; closure_set_waiting(cl, _RET_IP_); atomic_add(CLOSURE_WAITING + 1, &cl->remaining); llist_add(&cl->list, &waitlist->list); return true; } EXPORT_SYMBOL(closure_wait); struct closure_syncer { struct task_struct *task; int done; }; static CLOSURE_CALLBACK(closure_sync_fn) { struct closure *cl = container_of(ws, struct closure, work); struct closure_syncer *s = cl->s; struct task_struct *p; rcu_read_lock(); p = READ_ONCE(s->task); s->done = 1; wake_up_process(p); rcu_read_unlock(); } void __sched __closure_sync(struct closure *cl) { struct closure_syncer s = { .task = current }; cl->s = &s; continue_at(cl, closure_sync_fn, NULL); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); if (s.done) break; schedule(); } __set_current_state(TASK_RUNNING); } EXPORT_SYMBOL(__closure_sync); /* * closure_return_sync - finish running a closure, synchronously (i.e. waiting * for outstanding get()s to finish) and returning once closure refcount is 0. * * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent * closure_get_not_zero() calls waill fail. */ void __sched closure_return_sync(struct closure *cl) { struct closure_syncer s = { .task = current }; cl->s = &s; set_closure_fn(cl, closure_sync_fn, NULL); unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR, &cl->remaining); closure_put_after_sub_checks(flags); if (unlikely(flags & CLOSURE_REMAINING_MASK)) { while (1) { set_current_state(TASK_UNINTERRUPTIBLE); if (s.done) break; schedule(); } __set_current_state(TASK_RUNNING); } if (cl->parent) closure_put(cl->parent); } EXPORT_SYMBOL(closure_return_sync); int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout) { struct closure_syncer s = { .task = current }; int ret = 0; cl->s = &s; continue_at(cl, closure_sync_fn, NULL); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); if (s.done) break; if (!timeout) { /* * Carefully undo the continue_at() - but only if it * hasn't completed, i.e. the final closure_put() hasn't * happened yet: */ unsigned old, new, v = atomic_read(&cl->remaining); do { old = v; if (!old || (old & CLOSURE_RUNNING)) goto success; new = old + CLOSURE_REMAINING_INITIALIZER; } while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old); ret = -ETIME; } timeout = schedule_timeout(timeout); } success: __set_current_state(TASK_RUNNING); return ret; } EXPORT_SYMBOL(__closure_sync_timeout); #ifdef CONFIG_DEBUG_CLOSURES static LIST_HEAD(closure_list); static DEFINE_SPINLOCK(closure_list_lock); void closure_debug_create(struct closure *cl) { unsigned long flags; BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); cl->magic = CLOSURE_MAGIC_ALIVE; spin_lock_irqsave(&closure_list_lock, flags); list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { unsigned long flags; if (cl->magic == CLOSURE_MAGIC_STACK) return; BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); cl->magic = CLOSURE_MAGIC_DEAD; spin_lock_irqsave(&closure_list_lock, flags); list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } EXPORT_SYMBOL(closure_debug_destroy); static int debug_show(struct seq_file *f, void *data) { struct closure *cl; spin_lock_irq(&closure_list_lock); list_for_each_entry(cl, &closure_list, all) { int r = atomic_read(&cl->remaining); seq_printf(f, "%p: %pS -> %pS p %p r %i ", cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); seq_printf(f, "%s%s\n", test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&cl->work)) ? "Q" : "", r & CLOSURE_RUNNING ? "R" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pS\n", (void *) cl->waiting_on); seq_putc(f, '\n'); } spin_unlock_irq(&closure_list_lock); return 0; } DEFINE_SHOW_ATTRIBUTE(debug); static int __init closure_debug_init(void) { debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); return 0; } late_initcall(closure_debug_init) #endif |
| 55 55 55 67 83 83 83 83 66 67 67 67 66 67 67 83 83 136 135 135 56 81 78 77 59 59 60 60 21 21 21 21 2 21 21 21 15 12 16 1 8 9 9 9 9 9 9 9 9 6 6 6 9 9 9 9 6 6 6 9 9 21 9 9 9 1 1 3 3 3 3 2 2 14 14 12 2 2 26 26 3 26 25 26 26 3 3 26 26 44 9 39 63 56 5 23 23 43 43 27 20 11 44 44 27 27 58 58 58 58 55 55 55 54 55 55 95 96 96 1 1 1 1 77 76 78 55 55 55 55 55 78 2 2 2 2 3 1 4 4 1 3 2 1 3 1 77 78 23 55 78 76 7 11 77 7 1 4 140 140 140 140 140 2 139 139 139 139 140 1 140 72 140 140 69 69 69 69 63 6 69 139 27 27 24 24 24 6 3 4 2 4 24 15 15 9 9 8 1 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * * Code for managing the extent btree and dynamically updating the writeback * dirty sector count. */ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "compress.h" #include "debug.h" #include "disk_groups.h" #include "error.h" #include "extents.h" #include "inode.h" #include "journal.h" #include "rebalance.h" #include "replicas.h" #include "super.h" #include "super-io.h" #include "trace.h" #include "util.h" static const char * const bch2_extent_flags_strs[] = { #define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, BCH_EXTENT_FLAGS() #undef x NULL, }; static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, }; static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); void bch2_io_failures_to_text(struct printbuf *out, struct bch_fs *c, struct bch_io_failures *failed) { static const char * const error_types[] = { "btree validate", "io", "checksum", "ec reconstruct", NULL }; for (struct bch_dev_io_failures *f = failed->devs; f < failed->devs + failed->nr; f++) { unsigned errflags = ((!!f->failed_btree_validate) << 0) | ((!!f->failed_io) << 1) | ((!!f->failed_csum_nr) << 2) | ((!!f->failed_ec) << 3); bch2_printbuf_make_room(out, 1024); out->atomic++; scoped_guard(rcu) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); if (ca) prt_str(out, ca->name); else prt_printf(out, "(invalid device %u)", f->dev); } --out->atomic; prt_char(out, ' '); if (!errflags) { prt_str(out, "no error - confused"); } else if (is_power_of_2(errflags)) { prt_bitflags(out, error_types, errflags); prt_str(out, " error"); } else { prt_str(out, "errors: "); prt_bitflags(out, error_types, errflags); } prt_newline(out); } } struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, unsigned dev) { struct bch_dev_io_failures *i; for (i = f->devs; i < f->devs + f->nr; i++) if (i->dev == dev) return i; return NULL; } void bch2_mark_io_failure(struct bch_io_failures *failed, struct extent_ptr_decoded *p, bool csum_error) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); if (!f) { BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; memset(f, 0, sizeof(*f)); f->dev = p->ptr.dev; } if (p->do_ec_reconstruct) f->failed_ec = true; else if (!csum_error) f->failed_io = true; else f->failed_csum_nr++; } void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, unsigned dev) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); if (!f) { BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; memset(f, 0, sizeof(*f)); f->dev = dev; } f->failed_btree_validate = true; } static inline u64 dev_latency(struct bch_dev *ca) { return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } static inline int dev_failed(struct bch_dev *ca) { return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; } /* * returns true if p1 is better than p2: */ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, u64 p1_latency, struct bch_dev *ca1, const struct extent_ptr_decoded p2, u64 p2_latency) { struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); int failed_delta = dev_failed(ca1) - dev_failed(ca2); if (unlikely(failed_delta)) return failed_delta < 0; if (static_branch_unlikely(&bch2_force_reconstruct_read)) return p1.do_ec_reconstruct > p2.do_ec_reconstruct; if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) return p1.do_ec_reconstruct < p2.do_ec_reconstruct; int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; if (unlikely(crc_retry_delta)) return crc_retry_delta < 0; /* Pick at random, biased in favor of the faster device: */ return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; } /* * This picks a non-stale pointer, preferably from a device other than @avoid. * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to * other devices, it will still pick a pointer from avoid. */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_io_failures *failed, struct extent_ptr_decoded *pick, int dev) { bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return bch_err_throw(c, key_type_error); rcu_read_lock(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { have_dirty_ptrs |= !p.ptr.cached; /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { rcu_read_unlock(); return 0; } /* Are we being asked to read from a specific device? */ if (dev >= 0 && p.ptr.dev != dev) continue; struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { rcu_read_unlock(); int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); if (ret) return ret; rcu_read_lock(); } if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; struct bch_dev_io_failures *f = unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; if (unlikely(f)) { p.crc_retry_nr = f->failed_csum_nr; p.has_ec &= ~f->failed_ec; if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { have_io_errors |= f->failed_io; have_io_errors |= f->failed_btree_validate; have_io_errors |= f->failed_ec; } have_csum_errors |= !!f->failed_csum_nr; if (p.has_ec && (f->failed_io || f->failed_csum_nr)) p.do_ec_reconstruct = true; else if (f->failed_io || f->failed_btree_validate || f->failed_csum_nr > c->opts.checksum_err_retry_nr) continue; } have_missing_devs |= ca && !bch2_dev_is_online(ca); if (!ca || !bch2_dev_is_online(ca)) { if (!p.has_ec) continue; p.do_ec_reconstruct = true; } if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) p.do_ec_reconstruct = true; u64 p_latency = dev_latency(ca); /* * Square the latencies, to bias more in favor of the faster * device - we never want to stop issuing reads to the slower * device altogether, so that we can update our latency numbers: */ p_latency *= p_latency; if (!have_pick || ptr_better(c, p, p_latency, ca, *pick, pick_latency)) { *pick = p; pick_latency = p_latency; have_pick = true; } } rcu_read_unlock(); if (have_pick) return 1; if (!have_dirty_ptrs) return 0; if (have_missing_devs) return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) return bch_err_throw(c, data_read_csum_err); if (have_io_errors) return bch_err_throw(c, data_read_io_err); /* * If we get here, we have pointers (bkey_ptrs_validate() ensures that), * but they don't point to valid devices: */ return bch_err_throw(c, no_devices_valid); } /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { int ret = 0; bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { bch2_bkey_ptrs_to_text(out, c, k); } int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), c, btree_ptr_v2_min_key_bad, "min_key > key"); if ((from.flags & BCH_VALIDATE_write) && c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) bkey_fsck_err_on(!bp.v->sectors_written, c, btree_ptr_v2_written_0, "sectors_written == 0"); ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); prt_printf(out, "seq %llx written %u min_key %s", le64_to_cpu(bp.v->seq), le16_to_cpu(bp.v->sectors_written), BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); bch2_bpos_to_text(out, bp.v->min_key); prt_printf(out, " "); bch2_bkey_ptrs_to_text(out, c, k); } void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, unsigned big_endian, int write, struct bkey_s k) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); if (version < bcachefs_metadata_version_inode_btree_change && btree_id_is_extents(btree_id) && !bkey_eq(bp.v->min_key, POS_MIN)) bp.v->min_key = write ? bpos_nosnap_predecessor(bp.v->min_key) : bpos_nosnap_successor(bp.v->min_key); } /* KEY_TYPE_extent: */ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); union bch_extent_entry *en_l; const union bch_extent_entry *en_r; struct extent_ptr_decoded lp, rp; bool use_right_ptr; en_l = l_ptrs.start; en_r = r_ptrs.start; while (en_l < l_ptrs.end && en_r < r_ptrs.end) { if (extent_entry_type(en_l) != extent_entry_type(en_r)) return false; en_l = extent_entry_next(en_l); en_r = extent_entry_next(en_r); } if (en_l < l_ptrs.end || en_r < r_ptrs.end) return false; en_l = l_ptrs.start; en_r = r_ptrs.start; lp.crc = bch2_extent_crc_unpack(l.k, NULL); rp.crc = bch2_extent_crc_unpack(r.k, NULL); guard(rcu)(); while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != rp.ptr.offset + rp.crc.offset || lp.ptr.dev != rp.ptr.dev || lp.ptr.gen != rp.ptr.gen || lp.ptr.unwritten != rp.ptr.unwritten || lp.has_ec != rp.has_ec) return false; /* Extents may not straddle buckets: */ struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); if (!same_bucket) return false; if (lp.has_ec != rp.has_ec || (lp.has_ec && (lp.ec.block != rp.ec.block || lp.ec.redundancy != rp.ec.redundancy || lp.ec.idx != rp.ec.idx))) return false; if (lp.crc.compression_type != rp.crc.compression_type || lp.crc.nonce != rp.crc.nonce) return false; if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= lp.crc.uncompressed_size) { /* can use left extent's crc entry */ } else if (lp.crc.live_size <= rp.crc.offset) { /* can use right extent's crc entry */ } else { /* check if checksums can be merged: */ if (lp.crc.csum_type != rp.crc.csum_type || lp.crc.nonce != rp.crc.nonce || crc_is_compressed(lp.crc) || !bch2_checksum_mergeable(lp.crc.csum_type)) return false; if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || rp.crc.offset) return false; if (lp.crc.csum_type && lp.crc.uncompressed_size + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; } en_l = extent_entry_next(en_l); en_r = extent_entry_next(en_r); } en_l = l_ptrs.start; en_r = r_ptrs.start; while (en_l < l_ptrs.end && en_r < r_ptrs.end) { if (extent_entry_is_crc(en_l)) { struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return false; } en_l = extent_entry_next(en_l); en_r = extent_entry_next(en_r); } use_right_ptr = false; en_l = l_ptrs.start; en_r = r_ptrs.start; while (en_l < l_ptrs.end) { if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && use_right_ptr) en_l->ptr = en_r->ptr; if (extent_entry_is_crc(en_l)) { struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); use_right_ptr = false; if (crc_l.offset + crc_l.live_size + crc_r.live_size <= crc_l.uncompressed_size) { /* can use left extent's crc entry */ } else if (crc_l.live_size <= crc_r.offset) { /* can use right extent's crc entry */ crc_r.offset -= crc_l.live_size; bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, extent_entry_type(en_l)); use_right_ptr = true; } else { crc_l.csum = bch2_checksum_merge(crc_l.csum_type, crc_l.csum, crc_r.csum, crc_r.uncompressed_size << 9); crc_l.uncompressed_size += crc_r.uncompressed_size; crc_l.compressed_size += crc_r.compressed_size; bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, extent_entry_type(en_l)); } } en_l = extent_entry_next(en_l); en_r = extent_entry_next(en_r); } bch2_key_resize(l.k, l.k->size + r.k->size); return true; } /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, reservation_key_nr_replicas_invalid, "invalid nr_replicas (%u)", r.v->nr_replicas); fsck_err: return ret; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); prt_printf(out, "generation %u replicas %u", le32_to_cpu(r.v->generation), r.v->nr_replicas); } bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reservation l = bkey_s_to_reservation(_l); struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); if (l.v->generation != r.v->generation || l.v->nr_replicas != r.v->nr_replicas) return false; bch2_key_resize(l.k, l.k->size + r.k->size); return true; } /* Extent checksum entries: */ /* returns true if not equal */ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, struct bch_extent_crc_unpacked r) { return (l.csum_type != r.csum_type || l.compression_type != r.compression_type || l.compressed_size != r.compressed_size || l.uncompressed_size != r.uncompressed_size || l.offset != r.offset || l.live_size != r.live_size || l.nonce != r.nonce || bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, struct bch_extent_crc_unpacked n) { return !crc_is_compressed(u) && u.csum_type && u.uncompressed_size > u.live_size && bch2_csum_type_is_encryption(u.csum_type) == bch2_csum_type_is_encryption(n.csum_type); } bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; return false; } /* * We're writing another replica for this extent, so while we've got the data in * memory we'll be computing a new checksum for the currently live data. * * If there are other replicas we aren't moving, and they are checksummed but * not compressed, we can modify them to point to only the data that is * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; bool ret = false; /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { bkey_for_each_crc(&k->k, ptrs, u, i) if (!crc_is_compressed(u) && u.csum_type && u.live_size == u.uncompressed_size) { n = u; goto found; } return false; } found: BUG_ON(crc_is_compressed(n)); BUG_ON(n.offset); BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); ret = true; goto restart_narrow_pointers; } return ret; } static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { #define common_fields(_src) \ .type = BIT(type), \ .csum_type = _src.csum_type, \ .compression_type = _src.compression_type, \ ._compressed_size = _src.compressed_size - 1, \ ._uncompressed_size = _src.uncompressed_size - 1, \ .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: dst->crc32 = (struct bch_extent_crc32) { common_fields(src), .csum = (u32 __force) *((__le32 *) &src.csum.lo), }; break; case BCH_EXTENT_ENTRY_crc64: dst->crc64 = (struct bch_extent_crc64) { common_fields(src), .nonce = src.nonce, .csum_lo = (u64 __force) src.csum.lo, .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), }; break; case BCH_EXTENT_ENTRY_crc128: dst->crc128 = (struct bch_extent_crc128) { common_fields(src), .nonce = src.nonce, .csum = src.csum, }; break; default: BUG(); } #undef set_common_fields } void bch2_extent_crc_append(struct bkey_i *k, struct bch_extent_crc_unpacked new) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); union bch_extent_crc *crc = (void *) ptrs.end; enum bch_extent_entry_type type; if (bch_crc_bytes[new.csum_type] <= 4 && new.uncompressed_size <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc32; else if (bch_crc_bytes[new.csum_type] <= 10 && new.uncompressed_size <= CRC64_SIZE_MAX && new.nonce <= CRC64_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc64; else if (bch_crc_bytes[new.csum_type] <= 16 && new.uncompressed_size <= CRC128_SIZE_MAX && new.nonce <= CRC128_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc128; else BUG(); bch2_extent_crc_pack(crc, new, type); k->k.u64s += extent_entry_u64s(ptrs.end); EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } /* Generic code for keys with pointers: */ unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { return bch2_bkey_devs(k).nr; } unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) { return k.k->type == KEY_TYPE_reservation ? bkey_s_c_to_reservation(k).v->nr_replicas : bch2_bkey_dirty_devs(k).nr; } unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) { unsigned ret = 0; if (k.k->type == KEY_TYPE_reservation) { ret = bkey_s_c_to_reservation(k).v->nr_replicas; } else { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ret += !p.ptr.cached && !crc_is_compressed(p.crc); } return ret; } unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned ret = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && crc_is_compressed(p.crc)) ret += p.crc.compressed_size; return ret; } bool bch2_bkey_is_incompressible(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; bkey_for_each_crc(k.k, ptrs, crc, entry) if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) return true; return false; } unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; unsigned replicas = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.ptr.cached) continue; if (p.has_ec) replicas += p.ec.redundancy; replicas++; } return replicas; } static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) { if (p->ptr.cached) return 0; return p->has_ec ? p->ec.redundancy + 1 : ca->mi.durability; } unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); return ca ? __extent_ptr_durability(ca, p) : 0; } unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) return 0; return __extent_ptr_durability(ca, p); } unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned durability = 0; guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); return durability; } static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned durability = 0; guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); return durability; } void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); union bch_extent_entry *next = extent_entry_next(entry); memmove_u64s(entry, next, (u64 *) end - (u64 *) next); k->k.u64s -= extent_entry_u64s(entry); } void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = ptrs.start; goto found; } bkey_for_each_crc(&k->k, ptrs, crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = extent_entry_next(pos); goto found; } bch2_extent_crc_append(k, p->crc); pos = bkey_val_end(bkey_i_to_s(k)); found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; __extent_entry_insert(k, pos, to_entry(&p->ptr)); if (p->has_ec) { p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; __extent_entry_insert(k, pos, to_entry(&p->ec)); } } static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, union bch_extent_entry *entry) { union bch_extent_entry *i = ptrs.start; if (i == entry) return NULL; while (extent_entry_next(i) != entry) i = extent_entry_next(i); return i; } /* * Returns pointer to the next entry after the one being dropped: */ void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; bool drop_crc = true; if (k.k->type == KEY_TYPE_stripe) { ptr->dev = BCH_SB_MEMBER_INVALID; return; } EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); for (next = extent_entry_next(entry); next != ptrs.end; next = extent_entry_next(next)) { if (extent_entry_is_crc(next)) { break; } else if (extent_entry_is_ptr(next)) { drop_crc = false; break; } } extent_entry_drop(k, entry); while ((entry = extent_entry_prev(ptrs, entry))) { if (extent_entry_is_ptr(entry)) break; if ((extent_entry_is_crc(entry) && drop_crc) || extent_entry_is_stripe_ptr(entry)) extent_entry_drop(k, entry); } } void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { if (k.k->type != KEY_TYPE_stripe) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); const union bch_extent_entry *entry; struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev == ptr->dev && p.has_ec) { ptr->dev = BCH_SB_MEMBER_INVALID; return; } } bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached * pointers, we could set the cached pointers to dirty if they're not * stale - but to do that correctly we'd need to grab an open_bucket * reference so that we don't race with bucket reuse: */ if (have_dirty && !bch2_bkey_dirty_devs(k.s_c).nr) { k.k->type = KEY_TYPE_error; set_bkey_val_u64s(k.k, 0); } else if (!bch2_bkey_nr_ptrs(k.s_c)) { k.k->type = KEY_TYPE_deleted; set_bkey_val_u64s(k.k, 0); } } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) { bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); } const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) if (ptr->dev == dev) return ptr; return NULL; } bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_dev *ca; guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || !dev_ptr_stale_rcu(ca, ptr))) return true; return false; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, struct bch_extent_ptr m, u64 offset) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev == m.dev && p.ptr.gen == m.gen && (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == (s64) m.offset - offset) return true; return false; } /* * Returns true if two extents refer to the same data: */ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) { if (k1.k->type != k2.k->type) return false; if (bkey_extent_is_direct_data(k1.k)) { struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); const union bch_extent_entry *entry1, *entry2; struct extent_ptr_decoded p1, p2; if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) return false; bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && /* * This checks that the two pointers point * to the same region on disk - adjusting * for the difference in where the extents * start, since one may have been trimmed: */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && /* * This additionally checks that the * extents overlap on disk, since the * previous check may trigger spuriously * when one extent is immediately partially * overwritten with another extent (so that * on disk they are adjacent) and * compression is in use: */ ((p1.ptr.offset >= p2.ptr.offset && p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || (p2.ptr.offset >= p1.ptr.offset && p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; } else { /* KEY_TYPE_deleted, etc. */ return true; } } struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) { struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); union bch_extent_entry *entry2; struct extent_ptr_decoded p2; bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) return &entry2->ptr; return NULL; } static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_extent_ptr *ptr) { unsigned target = opts->promote_target ?: opts->foreground_target; if (target && !bch2_dev_in_target(c, ptr->dev, target)) return false; struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs; union bch_extent_entry *entry; struct extent_ptr_decoded p; bool have_cached_ptr; unsigned drop_dev = ptr->dev; guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* * Check if it's erasure coded - stripes can't contain cached * data. Possibly something we can fix in the future? */ if (&entry->ptr == ptr && p.has_ec) goto drop; if (p.ptr.cached) { if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { bch2_bkey_drop_ptr_noerror(k, &entry->ptr); ptr = NULL; goto restart_drop_ptrs; } have_cached_ptr = true; } } if (!ptr) bkey_for_each_ptr(ptrs, ptr2) if (ptr2->dev == drop_dev) ptr = ptr2; if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) goto drop; ptr->cached = true; return; drop: bch2_bkey_drop_ptr_noerror(k, ptr); } /* * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. * * Returns true if @k should be dropped entirely * * For existing keys, only called when btree nodes are being rewritten, not when * they're merely being compacted/resorted in memory. */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { struct bch_dev *ca; guard(rcu)(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (!(ca = bch2_dev_rcu(c, ptr->dev)) || dev_ptr_stale_rcu(ca, ptr) > 0)); return bkey_deleted(k.k); } /* * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. * * Like bch2_extent_normalize(), but also only keeps a single cached pointer on * the promote target. */ bool bch2_extent_normalize_by_opts(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s k) { struct bkey_ptrs ptrs; bool have_cached_ptr; guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; bkey_for_each_ptr(ptrs, ptr) if (ptr->cached) { if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { bch2_bkey_drop_ptr(k, ptr); goto restart_drop_ptrs; } have_cached_ptr = true; } return bkey_deleted(k.k); } void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { out->atomic++; guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, ptr->cached ? " cached" : ""); } else { u32 offset; u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); prt_printf(out, "ptr: %u:%llu:%u gen %u", ptr->dev, b, offset, ptr->gen); if (ca->mi.durability != 1) prt_printf(out, " d=%u", ca->mi.durability); if (ptr->cached) prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); int stale = dev_ptr_stale_rcu(ca, ptr); if (stale > 0) prt_printf(out, " stale"); else if (stale) prt_printf(out, " invalid"); } --out->atomic; } void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) { prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", crc->compressed_size, crc->uncompressed_size, crc->offset, crc->nonce); bch2_prt_csum_type(out, crc->csum_type); prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); prt_str(out, " compress "); bch2_prt_compression_type(out, crc->compression_type); } static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_rebalance *r) { prt_str(out, "rebalance:"); prt_printf(out, " replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) prt_str(out, " (inode)"); prt_str(out, " checksum="); bch2_prt_csum_opt(out, r->data_checksum); if (r->data_checksum_from_inode) prt_str(out, " (inode)"); if (r->background_compression || r->background_compression_from_inode) { prt_str(out, " background_compression="); bch2_compression_opt_to_text(out, r->background_compression); if (r->background_compression_from_inode) prt_str(out, " (inode)"); } if (r->background_target || r->background_target_from_inode) { prt_str(out, " background_target="); if (c) bch2_target_to_text(out, c, r->background_target); else prt_printf(out, "%u", r->background_target); if (r->background_target_from_inode) prt_str(out, " (inode)"); } if (r->promote_target || r->promote_target_from_inode) { prt_str(out, " promote_target="); if (c) bch2_target_to_text(out, c, r->promote_target); else prt_printf(out, "%u", r->promote_target); if (r->promote_target_from_inode) prt_str(out, " (inode)"); } if (r->erasure_code || r->erasure_code_from_inode) { prt_printf(out, " ec=%u", r->erasure_code); if (r->erasure_code_from_inode) prt_str(out, " (inode)"); } } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bool first = true; if (c) prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); bkey_extent_entry_for_each(ptrs, entry) { if (!first) prt_printf(out, " "); switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: { struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); bch2_extent_crc_unpacked_to_text(out, &crc); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; } case BCH_EXTENT_ENTRY_rebalance: bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; case BCH_EXTENT_ENTRY_flags: prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); break; default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; } first = false; } } static int extent_ptr_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata) { int ret = 0; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); /* bad pointers are repaired by check_fix_ptrs(): */ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { rcu_read_unlock(); return 0; } u32 bucket_offset; u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); unsigned first_bucket = ca->mi.first_bucket; u64 nbuckets = ca->mi.nbuckets; unsigned bucket_size = ca->mi.bucket_size; rcu_read_unlock(); bkey_fsck_err_on(bucket >= nbuckets, c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); bkey_fsck_err_on(bucket < first_bucket, c, ptr_before_first_bucket, "pointer before first bucket (%llu < %u)", bucket, first_bucket); bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; int ret = 0; if (bkey_is_btree_ptr(k.k)) size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, extent_ptrs_invalid_entry, "invalid extent entry type (got %u, max %u)", __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && !extent_entry_is_ptr(entry), c, btree_ptr_has_non_ptr, "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); if (ret) return ret; bkey_fsck_err_on(entry->ptr.cached && have_ec, c, ptr_cached_and_erasure_coded, "cached, erasure coded ptr"); if (!entry->ptr.unwritten) have_written = true; else have_unwritten = true; have_ec = false; crc_since_last_ptr = false; nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, ptr_crc_csum_type_unknown, "invalid checksum type"); bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, ptr_crc_compression_type_unknown, "invalid compression type"); bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, ptr_crc_uncompressed_size_too_small, "checksum offset + key size > uncompressed size"); bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); bkey_fsck_err_on(!crc_is_compressed(crc) && crc.compressed_size != crc.uncompressed_size, c, ptr_crc_uncompressed_size_mismatch, "not compressed but compressed != uncompressed size"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) bkey_fsck_err(c, ptr_crc_nonce_mismatch, "incorrect nonce"); } bkey_fsck_err_on(crc_since_last_ptr, c, ptr_crc_redundant, "redundant crc entry"); crc_since_last_ptr = true; size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: bkey_fsck_err_on(have_ec, c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; break; case BCH_EXTENT_ENTRY_rebalance: { /* * this shouldn't be a fsck error, for forward * compatibility; the rebalance code should just refetch * the compression opt if it's unknown */ #if 0 const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { struct bch_compression_opt opt = __bch2_compression_decode(r->compression); prt_printf(err, "invalid compression opt %u:%u", opt.type, opt.level); return bch_err_throw(c, invalid_bkey); } #endif break; } case BCH_EXTENT_ENTRY_flags: bkey_fsck_err_on(entry != ptrs.start, c, extent_flags_not_at_start, "extent flags entry not at start"); break; } } bkey_fsck_err_on(!nr_ptrs, c, extent_ptrs_no_ptrs, "no ptrs"); bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, extent_ptrs_too_many_ptrs, "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); bkey_fsck_err_on(have_written && have_unwritten, c, extent_ptrs_written_and_unwritten, "extent with unwritten and written ptrs"); bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, extent_ptrs_unwritten, "has unwritten ptrs"); bkey_fsck_err_on(crc_since_last_ptr, c, extent_ptrs_redundant_crc, "redundant crc entry"); bkey_fsck_err_on(have_ec, c, extent_ptrs_redundant_stripe, "redundant stripe entry"); fsck_err: return ret; } void bch2_ptr_swab(struct bkey_s k) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; u64 *d; for (d = (u64 *) ptrs.start; d != (u64 *) ptrs.end; d++) *d = swab64(*d); for (entry = ptrs.start; entry < ptrs.end; entry = extent_entry_next(entry)) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: break; case BCH_EXTENT_ENTRY_crc32: entry->crc32.csum = swab32(entry->crc32.csum); break; case BCH_EXTENT_ENTRY_crc64: entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); break; case BCH_EXTENT_ENTRY_crc128: entry->crc128.csum.hi = (__force __le64) swab64((__force u64) entry->crc128.csum.hi); entry->crc128.csum.lo = (__force __le64) swab64((__force u64) entry->crc128.csum.lo); break; case BCH_EXTENT_ENTRY_stripe_ptr: break; case BCH_EXTENT_ENTRY_rebalance: break; default: /* Bad entry type: will be caught by validate() */ return; } } } int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) { int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); if (ret) return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); if (ptrs.start != ptrs.end && extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { ptrs.start->flags.flags = flags; } else { struct bch_extent_flags f = { .type = BIT(BCH_EXTENT_ENTRY_flags), .flags = flags, }; __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); } return 0; } /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) { unsigned new_val_u64s = bkey_val_u64s(k.k); int val_u64s_delta; u64 sub; if (bkey_le(where, bkey_start_pos(k.k))) return 0; EBUG_ON(bkey_gt(where, k.k->p)); sub = where.offset - bkey_start_offset(k.k); k.k->size -= sub; if (!k.k->size) { k.k->type = KEY_TYPE_deleted; new_val_u64s = 0; } switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; bkey_extent_entry_for_each(ptrs, entry) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: if (!seen_crc) entry->ptr.offset += sub; break; case BCH_EXTENT_ENTRY_crc32: entry->crc32.offset += sub; break; case BCH_EXTENT_ENTRY_crc64: entry->crc64.offset += sub; break; case BCH_EXTENT_ENTRY_crc128: entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: case BCH_EXTENT_ENTRY_rebalance: case BCH_EXTENT_ENTRY_flags: break; } if (extent_entry_is_crc(entry)) seen_crc = true; } break; } case KEY_TYPE_reflink_p: { struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); break; } case KEY_TYPE_inline_data: case KEY_TYPE_indirect_inline_data: { void *p = bkey_inline_data_p(k); unsigned bytes = bkey_inline_data_bytes(k.k); sub = min_t(u64, sub << 9, bytes); memmove(p, p + sub, bytes - sub); new_val_u64s -= sub >> 3; break; } } val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; BUG_ON(val_u64s_delta < 0); set_bkey_val_u64s(k.k, new_val_u64s); memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); return -val_u64s_delta; } int bch2_cut_back_s(struct bpos where, struct bkey_s k) { unsigned new_val_u64s = bkey_val_u64s(k.k); int val_u64s_delta; u64 len = 0; if (bkey_ge(where, k.k->p)) return 0; EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); len = where.offset - bkey_start_offset(k.k); k.k->p.offset = where.offset; k.k->size = len; if (!len) { k.k->type = KEY_TYPE_deleted; new_val_u64s = 0; } switch (k.k->type) { case KEY_TYPE_inline_data: case KEY_TYPE_indirect_inline_data: new_val_u64s = (bkey_inline_data_offset(k.k) + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; break; } val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; BUG_ON(val_u64s_delta < 0); set_bkey_val_u64s(k.k, new_val_u64s); memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); return -val_u64s_delta; } |
| 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | // SPDX-License-Identifier: GPL-2.0 /* * QNX6 file system, Linux implementation. * * Version : 1.0.0 * * History : * * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. * */ #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/crc32.h> #include "qnx6.h" static void qnx6_mmi_copy_sb(struct qnx6_super_block *qsb, struct qnx6_mmi_super_block *sb) { qsb->sb_magic = sb->sb_magic; qsb->sb_checksum = sb->sb_checksum; qsb->sb_serial = sb->sb_serial; qsb->sb_blocksize = sb->sb_blocksize; qsb->sb_num_inodes = sb->sb_num_inodes; qsb->sb_free_inodes = sb->sb_free_inodes; qsb->sb_num_blocks = sb->sb_num_blocks; qsb->sb_free_blocks = sb->sb_free_blocks; /* the rest of the superblock is the same */ memcpy(&qsb->Inode, &sb->Inode, sizeof(sb->Inode)); memcpy(&qsb->Bitmap, &sb->Bitmap, sizeof(sb->Bitmap)); memcpy(&qsb->Longfile, &sb->Longfile, sizeof(sb->Longfile)); } struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) { struct buffer_head *bh1, *bh2 = NULL; struct qnx6_mmi_super_block *sb1, *sb2; struct qnx6_super_block *qsb = NULL; struct qnx6_sb_info *sbi; __u64 offset; /* Check the superblock signatures start with the first superblock */ bh1 = sb_bread(s, 0); if (!bh1) { pr_err("Unable to read first mmi superblock\n"); return NULL; } sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; sbi = QNX6_SB(s); if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) { pr_err("wrong signature (magic) in superblock #1.\n"); goto out; } } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { pr_err("superblock #1 checksum error\n"); goto out; } /* calculate second superblock blocknumber */ offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + QNX6_SUPERBLOCK_AREA / fs32_to_cpu(sbi, sb1->sb_blocksize); /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ brelse(bh1); bh1 = sb_bread(s, 0); if (!bh1) goto out; sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; /* read second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { pr_err("superblock #1 checksum error\n"); goto out; } qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); if (!qsb) { pr_err("unable to allocate memory.\n"); goto out; } if (fs64_to_cpu(sbi, sb1->sb_serial) > fs64_to_cpu(sbi, sb2->sb_serial)) { /* superblock #1 active */ qnx6_mmi_copy_sb(qsb, sb1); #ifdef CONFIG_QNX6FS_DEBUG qnx6_superblock_debug(qsb, s); #endif memcpy(bh1->b_data, qsb, sizeof(struct qnx6_super_block)); sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ qnx6_mmi_copy_sb(qsb, sb2); #ifdef CONFIG_QNX6FS_DEBUG qnx6_superblock_debug(qsb, s); #endif memcpy(bh2->b_data, qsb, sizeof(struct qnx6_super_block)); sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); pr_info("superblock #2 active\n"); } kfree(qsb); /* offset for mmi_fs is just SUPERBLOCK_AREA bytes */ sbi->s_blks_off = QNX6_SUPERBLOCK_AREA / s->s_blocksize; /* success */ return sbi->sb; out: if (bh1 != NULL) brelse(bh1); if (bh2 != NULL) brelse(bh2); return NULL; } |
| 4 3 27 6 27 6 5 1 3 300 291 2 134 14 11 200 135 22 56 1 37 2 41 201 155 2 36 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ocfs2.h * * Defines macros and structures used in OCFS2 * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef OCFS2_H #define OCFS2_H #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/lockdep.h> #include <linux/jbd2.h> /* For union ocfs2_dlm_lksb */ #include "stackglue.h" #include "ocfs2_fs.h" #include "ocfs2_lockid.h" #include "ocfs2_ioctl.h" /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" #include "reservations.h" #include "filecheck.h" /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small * amount of inlined blocks to be stored on an array and grow the * structure into a rb tree when necessary. */ #define OCFS2_CACHE_INFO_MAX_ARRAY 2 /* Flags for ocfs2_caching_info */ enum ocfs2_caching_info_flags { /* Indicates that the metadata cache is using the inline array */ OCFS2_CACHE_FL_INLINE = 1<<1, }; struct ocfs2_caching_operations; struct ocfs2_caching_info { /* * The parent structure provides the locks, but because the * parent structure can differ, it provides locking operations * to struct ocfs2_caching_info. */ const struct ocfs2_caching_operations *ci_ops; /* next two are protected by trans_inc_lock */ /* which transaction were we created on? Zero if none. */ unsigned long ci_created_trans; /* last transaction we were a part of. */ unsigned long ci_last_trans; /* Cache structures */ unsigned int ci_flags; unsigned int ci_num_cached; union { sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; struct rb_root ci_tree; } ci_cache; }; /* * Need this prototype here instead of in uptodate.h because journal.h * uses it. */ struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); /* this limits us to 256 nodes * if we need more, we can do a kmalloc for the map */ #define OCFS2_NODE_MAP_MAX_NODES 256 struct ocfs2_node_map { u16 num_nodes; unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; }; enum ocfs2_ast_action { OCFS2_AST_INVALID = 0, OCFS2_AST_ATTACH, OCFS2_AST_CONVERT, OCFS2_AST_DOWNCONVERT, }; /* actions for an unlockast function to take. */ enum ocfs2_unlock_action { OCFS2_UNLOCK_INVALID = 0, OCFS2_UNLOCK_CANCEL_CONVERT, OCFS2_UNLOCK_DROP_LOCK, }; /* ocfs2_lock_res->l_flags flags. */ #define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized * the lvb */ #define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in * dlm_lock */ #define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to * downconvert*/ #define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ #define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) #define OCFS2_LOCK_REFRESHING (0x00000020) #define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization * for shutdown paths */ #define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track * when to skip queueing * a lock because it's * about to be * dropped. */ #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a call to dlm_lock. Only exists with BUSY set. */ #define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread * from downconverting * before the upconvert * has completed */ #define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster * lock has already * returned, do not block * dc thread from * downconverting */ struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats { u64 ls_total; /* Total wait in NSEC */ u32 ls_gets; /* Num acquires */ u32 ls_fail; /* Num failed acquires */ /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ u64 ls_last; /* Last unlock time in USEC */ }; #endif struct ocfs2_lock_res { void *l_priv; const struct ocfs2_lock_res_ops *l_ops; struct list_head l_blocked_list; struct list_head l_mask_waiters; struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; signed char l_level; signed char l_requested; signed char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; /* used from AST/BAST funcs. */ /* Data packed - enum type ocfs2_ast_action */ unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; unsigned int l_pending_gen; spinlock_t l_lock; struct ocfs2_dlm_lksb l_lksb; wait_queue_head_t l_event; struct list_head l_debug_list; #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map l_lockdep_map; #endif }; enum ocfs2_orphan_reco_type { ORPHAN_NO_NEED_TRUNCATE = 0, ORPHAN_NEED_TRUNCATE, }; enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE }; struct ocfs2_orphan_scan { struct mutex os_lock; struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ }; struct ocfs2_dlm_debug { struct kref d_refcnt; u32 d_filter_secs; struct list_head d_lockres_tracking; }; enum ocfs2_vol_state { VOLUME_INIT = 0, VOLUME_MOUNTED, VOLUME_MOUNTED_QUOTAS, VOLUME_DISMOUNTED, VOLUME_DISABLED }; struct ocfs2_alloc_stats { atomic_t moves; atomic_t local_data; atomic_t bitmap_data; atomic_t bg_allocs; atomic_t bg_extends; }; enum ocfs2_local_alloc_state { OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for * this mountpoint. */ OCFS2_LA_ENABLED, /* Local alloc is in use. */ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number * of bits has been reduced. */ OCFS2_LA_DISABLED /* Local alloc has temporarily been * disabled. */ }; enum ocfs2_mount_options { OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 #define OCFS2_OSB_HARD_RO 0x0002 #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; struct super_block *sb; }; enum ocfs2_journal_trigger_type { OCFS2_JTR_DI, OCFS2_JTR_EB, OCFS2_JTR_RB, OCFS2_JTR_GD, OCFS2_JTR_DB, OCFS2_JTR_XB, OCFS2_JTR_DQ, OCFS2_JTR_DR, OCFS2_JTR_DL, OCFS2_JTR_NONE /* This must be the last entry */ }; #define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); enum ocfs2_recovery_state { OCFS2_REC_ENABLED = 0, OCFS2_REC_QUOTA_WANT_DISABLE, /* * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for * ocfs2_recovery_disable_quota() to work. */ OCFS2_REC_QUOTA_DISABLED, OCFS2_REC_WANT_DISABLE, /* * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work */ OCFS2_REC_DISABLED, }; struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; struct ocfs2_super { struct task_struct *commit_task; struct super_block *sb; struct inode *root_inode; struct inode *sys_root_inode; struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; struct inode **local_system_inodes; struct ocfs2_slot_info *slot_info; u32 *slot_recovery_generations; spinlock_t node_map_lock; u64 root_blkno; u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; char *uuid_str; u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; u32 s_feature_compat; u32 s_feature_incompat; u32 s_feature_ro_compat; /* Protects s_next_generation, osb_flags and s_inode_steal_slot. * Could protect more on osb as it's very short lived. */ spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; u16 s_inode_steal_slot; u16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; atomic_t s_num_meta_stolen; unsigned long s_mount_opt; unsigned int s_atime_quantum; unsigned int max_slots; unsigned int node_num; int slot_num; int preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; /* Journal triggers for checksum */ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; struct delayed_work la_enable_wq; /* * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; unsigned int local_alloc_default_bits; /* osb_clusters_at_boot can become stale! Do not trust it to * be up to date. */ unsigned int osb_clusters_at_boot; enum ocfs2_local_alloc_state local_alloc_state; /* protected * by osb_lock */ struct buffer_head *local_alloc_bh; u64 la_last_gd; struct ocfs2_reservation_map osb_la_resmap; unsigned int osb_resv_level; unsigned int osb_dir_resv_level; /* Next two fields are for local node slot recovery during * mount. */ struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; struct ocfs2_blockcheck_stats osb_ecc_stats; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct rw_semaphore nfs_sync_rwlock; struct ocfs2_lock_res osb_trim_fs_lockres; struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; wait_queue_head_t recovery_event; spinlock_t dc_task_lock; struct task_struct *dc_task; wait_queue_head_t dc_event; unsigned long dc_wake_sequence; unsigned long dc_work_sequence; /* * Any thread can add locks to the list, but the downconvert * thread is the only one allowed to remove locks. Any change * to this rule requires updating * ocfs2_downconvert_thread_do_work(). */ struct list_head blocked_lock_list; unsigned long blocked_lock_count; /* List of dquot structures to drop last reference to */ struct llist_head dquot_drop_list; struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; /* Truncate log info */ struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; struct ocfs2_orphan_scan osb_orphan_scan; /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; unsigned int osb_dx_mask; u32 osb_dx_seed[4]; /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; struct mutex system_file_mutex; /* * OCFS2 needs to schedule several different types of work which * require cluster locking, disk I/O, recovery waits, etc. Since these * types of work tend to be heavy we avoid using the kernel events * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; /* sysfs directory per partition */ struct kset *osb_dev_kset; /* file check related stuff */ struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) /* Useful typedef for passing around journal access functions */ typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); static inline int ocfs2_should_order_data(struct inode *inode) { if (!S_ISREG(inode->i_mode)) return 0; if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) return 0; return 1; } static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) return 1; return 0; } static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) { /* * Support for sparse files is a pre-requisite */ if (!ocfs2_sparse_alloc(osb)) return 0; if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) return 1; return 0; } static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) return 1; return 0; } static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) return 1; return 0; } static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) return 1; return 0; } static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) return 1; return 0; } static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) return 1; return 0; } static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) return OCFS2_DX_LINK_MAX; return OCFS2_LINK_MAX; } static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) { u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) { u16 lo, hi; lo = nlink; hi = nlink >> OCFS2_LINKS_HI_SHIFT; di->i_links_count = cpu_to_le16(lo); di->i_links_count_hi = cpu_to_le16(hi); } static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) { u32 links = ocfs2_read_links_count(di); links += n; ocfs2_set_links_count(di, links); } static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) return 1; return 0; } /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock * too! */ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, unsigned long flag) { spin_lock(&osb->osb_lock); osb->osb_flags |= flag; spin_unlock(&osb->osb_lock); } static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { spin_lock(&osb->osb_lock); osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); if (hard) osb->osb_flags |= OCFS2_OSB_HARD_RO; else osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); } static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_HARD_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) { return (osb->s_feature_incompat & (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); } static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) { return ocfs2_o2cb_stack(osb) && (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); } static inline int ocfs2_mount_local(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); } #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) #define OCFS2_IS_VALID_DX_ROOT(ptr) \ (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) #define OCFS2_IS_VALID_DX_LEAF(ptr) \ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) #define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { return (unsigned long)(blkno & (u64)ULONG_MAX); } static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, u32 clusters) { int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u64)clusters << c_to_b_bits; } static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; blocks += (1 << b_to_c_bits) - 1; return (u32)(blocks >> b_to_c_bits); } static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u32)(blocks >> b_to_c_bits); } static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; bytes += OCFS2_SB(sb)->s_clustersize - 1; /* OCFS2 just cannot have enough clusters to overflow this */ clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { bytes += sb->s_blocksize - 1; return bytes >> sb->s_blocksize_bits; } static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, u32 clusters) { return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; } static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, u64 blocks) { int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; unsigned int clusters; clusters = ocfs2_blocks_to_clusters(sb, blocks); return (u64)clusters << bits; } static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = ocfs2_clusters_for_bytes(sb, bytes); return (u64)clusters << cl_bits; } static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, u64 bytes) { u64 blocks; blocks = ocfs2_blocks_for_bytes(sb, bytes); return blocks << sb->s_blocksize_bits; } static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) { return (unsigned long)((bytes + 511) >> 9); } static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, unsigned long pg_index) { u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; if (unlikely(PAGE_SHIFT > cbits)) clusters = pg_index << (PAGE_SHIFT - cbits); else if (PAGE_SHIFT < cbits) clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } /* * Find the 1st page index which covers the given clusters. */ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, u32 clusters) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; if (PAGE_SHIFT > cbits) { index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); } else if (PAGE_SHIFT < cbits) { index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; } static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; if (PAGE_SHIFT < cbits) pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, unsigned int megs) { BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, unsigned int clusters) { return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { __set_bit_le(bit, bitmap); } #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) { __clear_bit_le(bit, bitmap); } #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) #define ocfs2_test_bit test_bit_le #define ocfs2_find_next_zero_bit find_next_zero_bit_le #define ocfs2_find_next_bit find_next_bit_le static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_set_bit(bit, bitmap); } static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_clear_bit(bit, bitmap); } static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); return ocfs2_test_bit(bit, bitmap); } static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, int start) { int fix = 0, ret, tmpmax; bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); tmpmax = max + fix; start += fix; ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; if (ret > max) return max; return ret; } #endif /* OCFS2_H */ |
| 12 178 11 26 9 88 79 9 2 9 62 6 105 1 160 136 35 47 13 5 5 12 13 49 41 19 21 1 40 15 39 39 21 21 96 65 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_EXTENTS_H #define _BCACHEFS_EXTENTS_H #include "bcachefs.h" #include "bkey.h" #include "extents_types.h" struct bch_fs; struct btree_trans; /* extent entries: */ #define extent_entry_last(_e) \ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) #define entry_to_ptr(_entry) \ ({ \ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ \ __builtin_choose_expr( \ type_is_exact(_entry, const union bch_extent_entry *), \ (const struct bch_extent_ptr *) (_entry), \ (struct bch_extent_ptr *) (_entry)); \ }) /* downcast, preserves const */ #define to_entry(_entry) \ ({ \ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ !type_is(_entry, struct bch_extent_ptr *) && \ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ \ __builtin_choose_expr( \ (type_is_exact(_entry, const union bch_extent_crc *) || \ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ (const union bch_extent_entry *) (_entry), \ (union bch_extent_entry *) (_entry)); \ }) #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) #define extent_entry_next_safe(_entry, _end) \ (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ ? extent_entry_next(_entry) \ : _end) static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; } static inline enum bch_extent_entry_type extent_entry_type(const union bch_extent_entry *e) { int ret = __ffs(e->type); EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); return ret; } static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) { switch (extent_entry_type(entry)) { #define x(f, n) \ case BCH_EXTENT_ENTRY_##f: \ return sizeof(struct bch_extent_##f); BCH_EXTENT_ENTRY_TYPES() #undef x default: BUG(); } } static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) { return extent_entry_bytes(entry) / sizeof(u64); } static inline void __extent_entry_insert(struct bkey_i *k, union bch_extent_entry *dst, union bch_extent_entry *new) { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), dst, (u64 *) end - (u64 *) dst); k->k.u64s += extent_entry_u64s(new); memcpy_u64s_small(dst, new, extent_entry_u64s(new)); } static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) { union bch_extent_entry *next = extent_entry_next(entry); /* stripes have ptrs, but their layout doesn't work with this code */ BUG_ON(k.k->type == KEY_TYPE_stripe); memmove_u64s_down(entry, next, (u64 *) bkey_val_end(k) - (u64 *) next); k.k->u64s -= (u64 *) next - (u64 *) entry; } static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; } static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) { return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) { switch (__extent_entry_type(e)) { case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: return true; default: return false; } } union bch_extent_crc { u8 type; struct bch_extent_crc32 crc32; struct bch_extent_crc64 crc64; struct bch_extent_crc128 crc128; }; #define __entry_to_crc(_entry) \ __builtin_choose_expr( \ type_is_exact(_entry, const union bch_extent_entry *), \ (const union bch_extent_crc *) (_entry), \ (union bch_extent_crc *) (_entry)) #define entry_to_crc(_entry) \ ({ \ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ \ __entry_to_crc(_entry); \ }) static inline struct bch_extent_crc_unpacked bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) { #define common_fields(_crc) \ .csum_type = _crc.csum_type, \ .compression_type = _crc.compression_type, \ .compressed_size = _crc._compressed_size + 1, \ .uncompressed_size = _crc._uncompressed_size + 1, \ .offset = _crc.offset, \ .live_size = k->size if (!crc) return (struct bch_extent_crc_unpacked) { .compressed_size = k->size, .uncompressed_size = k->size, .live_size = k->size, }; switch (extent_entry_type(to_entry(crc))) { case BCH_EXTENT_ENTRY_crc32: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc32), }; *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; return ret; } case BCH_EXTENT_ENTRY_crc64: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc64), .nonce = crc->crc64.nonce, .csum.lo = (__force __le64) crc->crc64.csum_lo, }; *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; return ret; } case BCH_EXTENT_ENTRY_crc128: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc128), .nonce = crc->crc128.nonce, .csum = crc->crc128.csum, }; return ret; } default: BUG(); } #undef common_fields } static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) { return (crc.compression_type != BCH_COMPRESSION_TYPE_none && crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); } static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) { return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); } void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { const union bch_extent_entry *start; const union bch_extent_entry *end; }; struct bkey_ptrs { union bch_extent_entry *start; union bch_extent_entry *end; }; static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_btree_ptr: { struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), to_entry(extent_entry_last(e)) }; } case KEY_TYPE_extent: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); return (struct bkey_ptrs_c) { e.v->start, extent_entry_last(e) }; } case KEY_TYPE_stripe: { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); return (struct bkey_ptrs_c) { to_entry(&s.v->ptrs[0]), to_entry(&s.v->ptrs[s.v->nr_blocks]), }; } case KEY_TYPE_reflink_v: { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); return (struct bkey_ptrs_c) { r.v->start, bkey_val_end(r), }; } case KEY_TYPE_btree_ptr_v2: { struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), to_entry(extent_entry_last(e)) }; } default: return (struct bkey_ptrs_c) { NULL, NULL }; } } static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) { struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); return (struct bkey_ptrs) { (void *) p.start, (void *) p.end }; } #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ (_entry) < (_end); \ (_entry) = extent_entry_next_safe(_entry, _end)) #define __bkey_ptr_next(_ptr, _end) \ ({ \ typeof(_end) _entry; \ \ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ if (extent_entry_is_ptr(_entry)) \ break; \ \ _entry < (_end) ? entry_to_ptr(_entry) : NULL; \ }) #define bkey_extent_entry_for_each_from(_p, _entry, _start) \ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) #define bkey_extent_entry_for_each(_p, _entry) \ bkey_extent_entry_for_each_from(_p, _entry, _p.start) #define __bkey_for_each_ptr(_start, _end, _ptr) \ for (typeof(_start) (_ptr) = (_start); \ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ (_ptr)++) #define bkey_ptr_next(_p, _ptr) \ __bkey_ptr_next(_ptr, (_p).end) #define bkey_for_each_ptr(_p, _ptr) \ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) #define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ({ \ __label__ out; \ \ (_ptr).has_ec = false; \ (_ptr).do_ec_reconstruct = false; \ (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ goto out; \ case BCH_EXTENT_ENTRY_crc32: \ case BCH_EXTENT_ENTRY_crc64: \ case BCH_EXTENT_ENTRY_crc128: \ (_ptr).crc = bch2_extent_crc_unpack(_k, \ entry_to_crc(_entry)); \ break; \ case BCH_EXTENT_ENTRY_stripe_ptr: \ (_ptr).ec = _entry->stripe_ptr; \ (_ptr).has_ec = true; \ break; \ default: \ /* nothing */ \ break; \ } \ out: \ _entry < (_end); \ }) #define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ (_entry) = _start; \ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ (_entry) = extent_entry_next_safe(_entry, _end)) #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) #define bkey_crc_next(_k, _end, _crc, _iter) \ ({ \ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ if (extent_entry_is_crc(_iter)) { \ (_crc) = bch2_extent_crc_unpack(_k, \ entry_to_crc(_iter)); \ break; \ } \ \ (_iter) < (_end); \ }) #define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ (_iter) = (_start); \ bkey_crc_next(_k, _end, _crc, _iter); \ (_iter) = extent_entry_next(_iter)) #define bkey_for_each_crc(_k, _p, _crc, _iter) \ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) /* Iterate over pointers in KEY_TYPE_extent: */ #define extent_ptr_next(_e, _ptr) \ __bkey_ptr_next(_ptr, extent_entry_last(_e)) #define extent_for_each_ptr(_e, _ptr) \ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ extent_entry_last(_e), _ptr, _entry) /* utility code common to all keys with pointers: */ void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, struct bch_io_failures *); struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *, bool); void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); #define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ .key_validate = bch2_btree_ptr_validate, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ .key_validate = bch2_btree_ptr_v2_validate, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ .trigger = bch2_trigger_extent, \ .min_val_size = 40, \ }) /* KEY_TYPE_extent: */ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent ((struct bkey_ops) { \ .key_validate = bch2_bkey_ptrs_validate, \ .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ .trigger = bch2_trigger_extent, \ }) /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation ((struct bkey_ops) { \ .key_validate = bch2_reservation_validate, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trigger = bch2_trigger_reservation, \ .min_val_size = 8, \ }) /* Extent checksum entries: */ bool bch2_can_narrow_extent_crcs(struct bkey_s_c, struct bch_extent_crc_unpacked); bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); void bch2_extent_crc_append(struct bkey_i *, struct bch_extent_crc_unpacked); /* Generic code for keys with pointers: */ static inline bool bkey_is_btree_ptr(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: return true; default: return false; } } static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return true; default: return false; } } static inline bool bkey_extent_is_inline_data(const struct bkey *k) { return k->type == KEY_TYPE_inline_data || k->type == KEY_TYPE_indirect_inline_data; } static inline unsigned bkey_inline_data_offset(const struct bkey *k) { switch (k->type) { case KEY_TYPE_inline_data: return sizeof(struct bch_inline_data); case KEY_TYPE_indirect_inline_data: return sizeof(struct bch_indirect_inline_data); default: BUG(); } } static inline unsigned bkey_inline_data_bytes(const struct bkey *k) { return bkey_val_bytes(k) - bkey_inline_data_offset(k); } #define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) static inline bool bkey_extent_is_data(const struct bkey *k) { return bkey_extent_is_direct_data(k) || bkey_extent_is_inline_data(k) || k->type == KEY_TYPE_reflink_p; } /* * Should extent be counted under inode->i_sectors? */ static inline bool bkey_extent_is_allocation(const struct bkey *k) { switch (k->type) { case KEY_TYPE_extent: case KEY_TYPE_reservation: case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: case KEY_TYPE_inline_data: case KEY_TYPE_indirect_inline_data: case KEY_TYPE_error: return true; default: return false; } } static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) if (ptr->unwritten) return true; return false; } static inline bool bkey_extent_is_reservation(struct bkey_s_c k) { return k.k->type == KEY_TYPE_reservation || bkey_extent_is_unwritten(k); } static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(p, ptr) ret.data[ret.nr++] = ptr->dev; return ret; } static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(p, ptr) if (!ptr->cached) ret.data[ret.nr++] = ptr->dev; return ret; } static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(p, ptr) if (ptr->cached) ret.data[ret.nr++] = ptr->dev; return ret; } unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) { return (void *) bch2_bkey_has_device_c(k.s_c, dev); } bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { struct bch_extent_ptr *dest; EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); switch (k->k.type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); *dest = ptr; k->k.u64s++; break; default: BUG(); } } void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); void bch2_bkey_drop_device(struct bkey_s, unsigned); #define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ do { \ __label__ _again; \ struct bkey_ptrs _ptrs; \ _again: \ _ptrs = bch2_bkey_ptrs(_k); \ \ bkey_for_each_ptr(_ptrs, _ptr) \ if (_cond) { \ bch2_bkey_drop_ptr_noerror(_k, _ptr); \ goto _again; \ } \ } while (0) #define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ do { \ __label__ _again; \ struct bkey_ptrs _ptrs; \ _again: \ _ptrs = bch2_bkey_ptrs(_k); \ \ bkey_for_each_ptr(_ptrs, _ptr) \ if (_cond) { \ bch2_bkey_drop_ptr(_k, _ptr); \ goto _again; \ } \ } while (0) bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, struct bch_extent_ptr ptr2) { return (ptr1.cached == ptr2.cached && ptr1.unwritten == ptr2.unwritten && ptr1.offset == ptr2.offset && ptr1.dev == ptr2.dev && ptr1.gen == ptr2.gen); } void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ enum bch_extent_overlap { BCH_EXTENT_OVERLAP_ALL = 0, BCH_EXTENT_OVERLAP_BACK = 1, BCH_EXTENT_OVERLAP_FRONT = 2, BCH_EXTENT_OVERLAP_MIDDLE = 3, }; /* Returns how k overlaps with m */ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, const struct bkey *m) { int cmp1 = bkey_lt(k->p, m->p); int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); return (cmp1 << 1) + cmp2; } int bch2_cut_front_s(struct bpos, struct bkey_s); int bch2_cut_back_s(struct bpos, struct bkey_s); static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { bch2_cut_front_s(where, bkey_i_to_s(k)); } static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) { bch2_cut_back_s(where, bkey_i_to_s(k)); } /** * bch_key_resize - adjust size of @k * * bkey_start_offset(k) will be preserved, modifies where the extent ends */ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) { k->p.offset -= k->size; k->p.offset += new_size; k->size = new_size; } static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) { if (ptrs.start != ptrs.end && extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) return ptrs.start->flags.flags; return 0; } static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) { return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); } int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); #endif /* _BCACHEFS_EXTENTS_H */ |
| 2 45 47 47 45 1 1 1 1 41 41 37 4 41 41 38 3 38 3 3 41 1 4 35 1 35 1 6 30 9 9 9 6 9 3 9 6 6 6 2 1 1 20 9 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 | /* * linux/fs/hfs/mdb.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains functions for reading/writing the MDB. */ #include <linux/cdrom.h> #include <linux/blkdev.h> #include <linux/nls.h> #include <linux/slab.h> #include "hfs_fs.h" #include "btree.h" /*================ File-local data types ================*/ /* * The HFS Master Directory Block (MDB). * * Also known as the Volume Information Block (VIB), this structure is * the HFS equivalent of a superblock. * * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 * * modified for HFS Extended */ static int hfs_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; if (!cdi) return -EINVAL; te.cdte_track = HFS_SB(sb)->session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) || (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { pr_err("invalid session number or type of track\n"); return -EINVAL; } *start = (sector_t)te.cdte_addr.lba << 2; } else if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) *start = (sector_t)ms_info.addr.lba << 2; } return 0; } /* * hfs_mdb_get() * * Build the in-core MDB for a filesystem, including * the B-trees and the volume bitmap. */ int hfs_mdb_get(struct super_block *sb) { struct buffer_head *bh; struct hfs_mdb *mdb, *mdb2; unsigned int block; char *ptr; int off2, len, size, sect; sector_t part_start, part_size; loff_t off; __be16 attrib; /* set the device driver to 512-byte blocks */ size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); if (!size) return -EINVAL; if (hfs_get_last_session(sb, &part_start, &part_size)) return -EINVAL; while (1) { /* See if this is an HFS filesystem */ bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) goto out; if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) break; brelse(bh); /* check for a partition block * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) goto out; } HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { pr_err("bad allocation block size %d\n", size); goto out_bh; } size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); /* size must be a multiple of 512 */ while (size & (size - 1)) size -= HFS_SECTOR_SIZE; sect = be16_to_cpu(mdb->drAlBlSt) + part_start; /* align block size to first sector */ while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) size >>= 1; /* align block size to weird alloc size */ while (HFS_SB(sb)->alloc_blksz & (size - 1)) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { pr_err("unable to set blocksize to %u\n", size); goto out; } bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) goto out; if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) goto out_bh; HFS_SB(sb)->mdb_bh = bh; HFS_SB(sb)->mdb = mdb; /* These parameters are read from the MDB, and never written */ HFS_SB(sb)->part_start = part_start; HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / HFS_SB(sb)->alloc_blksz; if (!HFS_SB(sb)->clumpablks) HFS_SB(sb)->clumpablks = 1; HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); /* TRY to get the alternate (backup) MDB. */ sect = part_start + part_size - 2; bh = sb_bread512(sb, sect, mdb2); if (bh) { if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { HFS_SB(sb)->alt_mdb_bh = bh; HFS_SB(sb)->alt_mdb = mdb2; } else brelse(bh); } if (!HFS_SB(sb)->alt_mdb) { pr_warn("unable to locate alternate MDB\n"); pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out; /* read in the bitmap */ block = be16_to_cpu(mdb->drVBMSt) + part_start; off = (loff_t)block << HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 8) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { pr_err("unable to read volume bitmap\n"); goto out; } off2 = off & (sb->s_blocksize - 1); len = min((int)sb->s_blocksize - off2, size); memcpy(ptr, bh->b_data + off2, len); brelse(bh); ptr += len; off += len; size -= len; } HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { pr_err("unable to open extent tree\n"); goto out; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { pr_err("unable to open catalog tree\n"); goto out; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); mdb->drAtrb = attrib; be32_add_cpu(&mdb->drWrCnt, 1); mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; out_bh: brelse(bh); out: hfs_mdb_put(sb); return -EIO; } /* * hfs_mdb_commit() * * Description: * This updates the MDB on disk. * It does not check, if the superblock has been modified, or * if the filesystem has been mounted read-only. It is mainly * called by hfs_sync_fs() and flush_mdb(). * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * int backup; * Output Variable(s): * NONE * Returns: * void * Preconditions: * 'mdb' points to a "valid" (struct hfs_mdb). * Postconditions: * The HFS MDB and on disk will be updated, by copying the possibly * modified fields from the in memory MDB (in native byte order) to * the disk block buffer. * If 'backup' is non-zero then the alternate MDB is also written * and the function doesn't return until it is actually on disk. */ void hfs_mdb_commit(struct super_block *sb) { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; if (sb_rdonly(sb)) return; lock_buffer(HFS_SB(sb)->mdb_bh); if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* write the backup MDB, not returning until it is written. * we only do this when either the catalog or extents overflow * files grow. */ if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && HFS_SB(sb)->alt_mdb) { hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, &mdb->drXTFlSize, NULL); hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, &mdb->drCTFlSize, NULL); lock_buffer(HFS_SB(sb)->alt_mdb_bh); memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); unlock_buffer(HFS_SB(sb)->alt_mdb_bh); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { struct buffer_head *bh; sector_t block; char *ptr; int off, size, len; block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 7) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, block); if (!bh) { pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); lock_buffer(bh); memcpy(bh->b_data + off, ptr, len); unlock_buffer(bh); mark_buffer_dirty(bh); brelse(bh); block++; off = 0; ptr += len; size -= len; } } unlock_buffer(HFS_SB(sb)->mdb_bh); } void hfs_mdb_close(struct super_block *sb) { /* update volume attributes */ if (sb_rdonly(sb)) return; HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* * hfs_mdb_put() * * Release the resources associated with the in-core MDB. */ void hfs_mdb_put(struct super_block *sb) { if (!HFS_SB(sb)) return; /* free the B-trees */ hfs_btree_close(HFS_SB(sb)->ext_tree); hfs_btree_close(HFS_SB(sb)->cat_tree); /* free the buffers holding the primary and alternate MDBs */ brelse(HFS_SB(sb)->mdb_bh); brelse(HFS_SB(sb)->alt_mdb_bh); unload_nls(HFS_SB(sb)->nls_io); unload_nls(HFS_SB(sb)->nls_disk); kfree(HFS_SB(sb)->bitmap); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; } |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. * * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert()) returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different type of interfaces * we support. For a regular station this mean we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. */ struct sta_link_alloc { struct link_sta_info info; struct ieee80211_link_sta sta; struct rcu_head rcu_head; }; static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static const struct rhashtable_params link_sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct link_sta_info, link_hash_node), .key_offset = offsetof(struct link_sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_remove(&local->sta_hash, &sta->hash_node, sta_rht_params); } static int link_sta_info_hash_add(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_insert(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } static int link_sta_info_hash_del(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_remove(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } void ieee80211_purge_sta_txqs(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; int i; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi; if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); ieee80211_txq_purge(local, txqi); } } static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); } ieee80211_purge_sta_txqs(sta); for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. It would be nice to wait for the * driver to finish aggregation stop and then clean up, but for now * drivers have to handle aggregation stop being requested, followed * directly by station destruction. */ for (i = 0; i < IEEE80211_NUM_TIDS; i++) { kfree(sta->ampdu_mlme.tid_start_tx[i]); tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } } static void cleanup_single_sta(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; __cleanup_single_sta(sta); sta_info_free(local, sta); } struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); } /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } /* * Get sta info either from the specified interface * or from one of its vlans */ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->link_sta_hash, addr, link_sta_rht_params); } struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct link_sta_info *link_sta; rcu_read_lock(); for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return link_sta; } } rcu_read_unlock(); return NULL; } struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id) { struct ieee80211_local *local = hw_to_local(hw); struct link_sta_info *link_sta; struct rhlist_head *tmp; for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; struct ieee80211_link_data *link; u8 _link_id = link_sta->link_id; if (!localaddr) { if (link_id) *link_id = _link_id; return &sta->sta; } link = rcu_dereference(sta->sdata->link[_link_id]); if (!link) continue; if (memcmp(link->conf->addr, localaddr, ETH_ALEN)) continue; if (link_id) *link_id = _link_id; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs); struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } static void sta_info_free_link(struct link_sta_info *link_sta) { free_percpu(link_sta->pcpu_rx_stats); } static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { struct sta_link_alloc *alloc = NULL; struct link_sta_info *link_sta; lockdep_assert_wiphy(sta->local->hw.wiphy); link_sta = rcu_access_pointer(sta->link[link_id]); if (WARN_ON(!link_sta)) return; if (unhash) link_sta_info_hash_del(sta->local, link_sta); if (test_sta_flag(sta, WLAN_STA_INSERTED)) ieee80211_link_sta_debugfs_remove(link_sta); if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { sta_info_free_link(&alloc->info); kfree_rcu(alloc, rcu_head); } ieee80211_sta_recalc_aggregates(&sta->sta); } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_access_pointer(sta->link[i]); if (!link_sta) continue; sta_remove_link(sta, i, false); } /* * If we had used sta_info_pre_move_state() then we might not * have gone through the state transitions down again, so do * it here now (and warn if it's inserted). * * This will clear state such as fast TX/RX that may have been * allocated during state transitions. */ while (sta->sta_state > IEEE80211_STA_NONE) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, sta->sta_state - 1); if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) break; } if (sta->rate_ctrl) rate_control_free_sta(sta); sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif sta_info_free_link(&sta->deflink); kfree(sta); } static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_insert(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; local_bh_disable(); if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; return 0; } static int sta_info_alloc_link(struct ieee80211_local *local, struct link_sta_info *link_info, gfp_t gfp) { struct ieee80211_hw *hw = &local->hw; int i; if (ieee80211_hw_check(hw, USES_RSS)) { link_info->pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!link_info->pcpu_rx_stats) return -ENOMEM; } link_info->rx_stats.last_rx = jiffies; u64_stats_init(&link_info->rx_stats.syncp); ewma_signal_init(&link_info->rx_stats_avg.signal); ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++) ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]); link_info->rx_omi_bw_rx = IEEE80211_STA_RX_BW_MAX; link_info->rx_omi_bw_tx = IEEE80211_STA_RX_BW_MAX; link_info->rx_omi_bw_staging = IEEE80211_STA_RX_BW_MAX; /* * Cause (a) warning(s) if IEEE80211_STA_RX_BW_MAX != 320 * or if new values are added to the enum. */ switch (link_info->cur_max_bandwidth) { case IEEE80211_STA_RX_BW_20: case IEEE80211_STA_RX_BW_40: case IEEE80211_STA_RX_BW_80: case IEEE80211_STA_RX_BW_160: case IEEE80211_STA_RX_BW_MAX: /* intentionally nothing */ break; } return 0; } static void sta_info_add_link(struct sta_info *sta, unsigned int link_id, struct link_sta_info *link_info, struct ieee80211_link_sta *link_sta) { link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); link_sta->smps_mode = IEEE80211_SMPS_OFF; link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; } static struct sta_info * __sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, int link_id, const u8 *link_addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; void *txq_data; int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; sta->local = local; sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, &sta->sta.deflink); sta->sta.valid_links = BIT(link_id); } else { sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink); } sta->sta.cur = &sta->sta.deflink.agg; spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (!sdata->u.mesh.user_mpm) timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); memcpy(sta->deflink.addr, link_addr, ETH_ALEN); memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN); sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; /* TODO link specific alloc and assignments for MLO Link STA */ /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. To not use the initial Rx-only key * prematurely for Tx initialize ptk_idx to an impossible PTK keyid * which always will refer to a NULL key. */ BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); sta->ptk_idx = INVALID_PTK_KEYIDX; ieee80211_init_frag_cache(&sta->frags); sta->sta_state = IEEE80211_STA_NONE; if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) sta->amsdu_mesh_control = -1; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); size = sizeof(struct txq_info) + ALIGN(hw->txq_data_size, sizeof(void *)); txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); if (!txq_data) goto free; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; /* might not do anything for the (bufferable) MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; atomic_set(&sta->airtime[i].aql_tx_pending, 0); sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); for (i = 0; i < NUM_NL80211_BANDS; i++) { u32 mandatory = 0; int r; if (!hw->wiphy->bands[i]) continue; switch (i) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use * for this is when we don't know anything yet and send * management frames, and then we'll pick the lowest * possible rate anyway. * If we don't include _G here, we cannot find a rate * in P2P, and thus trigger the WARN_ONCE() in rate.c */ mandatory = IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: WARN_ON(1); mandatory = 0; break; } for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { struct ieee80211_rate *rate; rate = &hw->wiphy->bands[i]->bitrates[r]; if (!(rate->flags & mandatory)) continue; sta->sta.deflink.supp_rates[i] |= BIT(r); } } sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; free_txq: kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif kfree(sta); return NULL; } struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { return __sta_info_alloc(sdata, addr, -1, addr, gfp); } struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp) { return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp); } static int sta_info_insert_check(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Can't be a WARN_ON because it can be triggered through a race: * something inserts a STA (on one CPU) without holding the RTNL * and another CPU turns off the net device. */ if (unlikely(!ieee80211_sdata_running(sdata))) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to * asynchronous resize/rehash. We also require the mutex * for correctness. */ rcu_read_lock(); if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { rcu_read_unlock(); return -ENOTUNIQ; } rcu_read_unlock(); return 0; } static int sta_info_insert_drv_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { enum ieee80211_sta_state state; int err = 0; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { err = drv_sta_state(local, sdata, sta, state, state + 1); if (err) break; } if (!err) { /* * Drivers using legacy sta_add/sta_remove callbacks only * get uploaded set to true after sta_add is called. */ if (!local->ops->sta_add) sta->uploaded = true; return 0; } if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { sdata_info(sdata, "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n", sta->sta.addr, state + 1, err); err = 0; } /* unwind on error */ for (; state > IEEE80211_STA_NOTEXIST; state--) WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1)); return err; } static void ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; bool allow_p2p_go_ps = sdata->vif.p2p; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; if (!sta->sta.support_p2p_ps) { allow_p2p_go_ps = false; break; } } rcu_read_unlock(); if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_P2P_PS); } } static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo = NULL; int err = 0; lockdep_assert_wiphy(local->hw.wiphy); /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_cleanup; } sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); if (!sinfo) { err = -ENOMEM; goto out_cleanup; } local->num_sta++; local->sta_generation++; smp_mb(); /* simplify things and don't accept BA sessions yet */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); /* make the station visible */ err = sta_info_hash_add(local, sta); if (err) goto out_drop_sta; if (sta->sta.valid_links) { err = link_sta_info_hash_add(local, &sta->deflink); if (err) { sta_info_hash_del(local, sta); goto out_drop_sta; } } list_add_tail_rcu(&sta->list, &local->sta_list); /* update channel context before notifying the driver about state * change, this enables driver using the updated channel context right away. */ if (sta->sta_state >= IEEE80211_STA_ASSOC) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } /* notify driver */ err = sta_info_insert_drv_state(local, sdata, sta); if (err) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); if (sta->sta.valid_links) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) continue; ieee80211_link_sta_debugfs_add(link_sta); if (sdata->vif.active_links & BIT(i)) ieee80211_link_sta_debugfs_drv_add(link_sta); } } else { ieee80211_link_sta_debugfs_add(&sta->deflink); ieee80211_link_sta_debugfs_drv_add(&sta->deflink); } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); /* move reference to rcu-protected */ rcu_read_lock(); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); ieee80211_check_fast_xmit(sta); return 0; out_remove: if (sta->sta.valid_links) link_sta_info_hash_del(local, &sta->deflink); sta_info_hash_del(local, sta); list_del_rcu(&sta->list); out_drop_sta: local->num_sta--; synchronize_net(); out_cleanup: cleanup_single_sta(sta); kfree(sinfo); rcu_read_lock(); return err; } int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; int err; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); err = sta_info_insert_check(sta); if (err) { sta_info_free(local, sta); rcu_read_lock(); return err; } return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) { int err = sta_info_insert_rcu(sta); rcu_read_unlock(); return err; } static inline void __bss_tim_set(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __set_bit() format. */ tim[id / 8] |= (1 << (id % 8)); } static inline void __bss_tim_clear(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __clear_bit() format. */ tim[id / 8] &= ~(1 << (id % 8)); } static inline bool __bss_tim_get(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the test_bit() format. */ return tim[id / 8] & (1 << (id % 8)); } static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ switch (ac) { case IEEE80211_AC_VO: return BIT(6) | BIT(7); case IEEE80211_AC_VI: return BIT(4) | BIT(5); case IEEE80211_AC_BE: return BIT(0) | BIT(3); case IEEE80211_AC_BK: return BIT(1) | BIT(2); default: WARN_ON(1); return 0; } } static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) { struct ieee80211_local *local = sta->local; struct ps_data *ps; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (WARN_ON_ONCE(!sta->sdata->bss)) return; ps = &sta->sdata->bss->ps; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; #endif } else { return; } /* No need to do anything if the driver does all */ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) goto done; /* * If all ACs are delivery-enabled then we should build * the TIM bit for all ACs anyway; if only some are then * we ignore those and build the TIM bit using only the * non-enabled ones. */ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_tim = 0; if (ignore_pending) ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac]) continue; indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac]); if (indicate_tim) break; tids = ieee80211_tids_for_ac(ac); indicate_tim |= sta->driver_buffered_tids & tids; indicate_tim |= sta->txq_buffered_tids & tids; } done: spin_lock_bh(&local->tim_lock); if (indicate_tim == __bss_tim_get(ps->tim, id)) goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); else __bss_tim_clear(ps->tim, id); if (local->ops->set_tim && !WARN_ON(sta->dead)) { local->tim_in_locked_section = true; drv_set_tim(local, &sta->sta, indicate_tim); local->tim_in_locked_section = false; } out_unlock: spin_unlock_bh(&local->tim_lock); } void sta_info_recalc_tim(struct sta_info *sta) { __sta_info_recalc_tim(sta, false); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info; int timeout; if (!skb) return false; info = IEEE80211_SKB_CB(skb); /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ timeout = (sta->listen_interval * sta->sdata->vif.bss_conf.beacon_int * 32 / 15625) * HZ; if (timeout < STA_TX_BUFFER_EXPIRE) timeout = STA_TX_BUFFER_EXPIRE; return time_after(jiffies, info->control.jiffies + timeout); } static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local, struct sta_info *sta, int ac) { unsigned long flags; struct sk_buff *skb; /* * First check for frames that should expire on the filtered * queue. Frames here were rejected by the driver and are on * a separate queue to avoid reordering with normal PS-buffered * frames. They also aren't accounted for right now in the * total_ps_buffered counter. */ for (;;) { spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb = skb_peek(&sta->tx_filtered[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->tx_filtered[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); /* * Frames are queued in order, so if this one * hasn't expired yet we can stop testing. If * we actually reached the end of the queue we * also need to stop, of course. */ if (!skb) break; ieee80211_free_txskb(&local->hw, skb); } /* * Now also check the normal PS-buffered queue, this will * only find something if the filtered queue was emptied * since the filtered frames are all before the normal PS * buffered frames. */ for (;;) { spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb = skb_peek(&sta->ps_tx_buf[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->ps_tx_buf[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); /* * frames are queued in order, so if this one * hasn't expired yet (or we reached the end of * the queue) we can stop testing */ if (!skb) break; local->total_ps_buffered--; ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n", sta->sta.addr); ieee80211_free_txskb(&local->hw, skb); } /* * Finally, recalculate the TIM bit for this station -- it might * now be clear because the station was too slow to retrieve its * frames. */ sta_info_recalc_tim(sta); /* * Return whether there are any frames still buffered, this is * used to check whether the cleanup timer still needs to run, * if there are no frames we don't need to rearm the timer. */ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) && skb_queue_empty(&sta->tx_filtered[ac])); } static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, struct sta_info *sta) { bool have_buffered = false; int ac; /* This is only necessary for stations on BSS/MBSS interfaces */ if (!sta->sdata->bss && !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) have_buffered |= sta_info_cleanup_expire_buffered_ac(local, sta, ac); return have_buffered; } static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; int ret, i; might_sleep(); if (!sta) return -ENOENT; local = sta->local; sdata = sta->sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * Before removing the station from the driver and * rate control, it might still start new aggregation * sessions -- block that to make sure the tear-down * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); /* * Before removing the station from the driver there might be pending * rx frames on RSS queues sent prior to the disassociation - wait for * all such frames to be processed. */ drv_sync_rx_queues(local, sta); for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; if (!(sta->sta.valid_links & BIT(i))) continue; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); link_sta_info_hash_del(local, link_sta); } ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; /* * for TDLS peers, make sure to return to the base channel before * removal. */ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) { drv_tdls_cancel_channel_switch(local, sdata, &sta->sta); clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL); } list_del_rcu(&sta->list); sta->removed = true; if (sta->uploaded) drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); return 0; } static int _sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state, bool recalc) { struct ieee80211_local *local = sta->local; might_sleep(); if (sta->sta_state == new_state) return 0; /* check allowed transitions first */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state != IEEE80211_STA_AUTH) return -EINVAL; break; case IEEE80211_STA_AUTH: if (sta->sta_state != IEEE80211_STA_NONE && sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; case IEEE80211_STA_ASSOC: if (sta->sta_state != IEEE80211_STA_AUTH && sta->sta_state != IEEE80211_STA_AUTHORIZED) return -EINVAL; break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; default: WARN(1, "invalid state %d", new_state); return -EINVAL; } sta_dbg(sta->sdata, "moving STA %pM to state %d\n", sta->sta.addr, new_state); /* notify the driver before the actual changes so it can * fail the transition if the state is increasing. * The driver is required not to fail when the transition * is decreasing the state, so first, do all the preparation * work and only then, notify the driver. */ if (new_state > sta->sta_state && test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) return err; } /* reflect the change in all state variables */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state == IEEE80211_STA_AUTH) clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); sta->assoc_at = ktime_get_boottime_ns(); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ieee80211_vif_dec_num_mcast(sta->sdata); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); /* * If we have encryption offload, flush (station) queues * (after ensuring concurrent TX completed) so we won't * transmit anything later unencrypted if/when keys are * also removed, which might otherwise happen depending * on how the hardware offload works. */ if (local->ops->set_key) { synchronize_net(); if (local->ops->flush_sta) drv_flush_sta(local, sta->sdata, sta); else ieee80211_flush_queues(local, sta->sdata, false); } ieee80211_clear_fast_xmit(sta); ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state == IEEE80211_STA_ASSOC) { ieee80211_vif_inc_num_mcast(sta->sdata); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sta->sdata->vif.type == NL80211_IFTYPE_AP) cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); break; default: break; } if (new_state < sta->sta_state && test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); WARN_ONCE(err, "Driver is not allowed to fail if the sta_state is transitioning down the list: %d\n", err); } sta->sta_state = new_state; return 0; } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { return _sta_info_move_state(sta, new_state, true); } static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo; int ret; /* * NOTE: This assumes at least synchronize_net() was done * after _part1 and before _part2! */ /* * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA * but someone might have just gotten past a check, and not yet into * queuing the work/creating the data/etc. * * Do another round of destruction so that the worker is certainly * canceled before we later free the station. * * Since this is after synchronize_rcu()/synchronize_net() we're now * certain that nobody can actually hold a reference to the STA and * be calling e.g. ieee80211_start_tx_ba_session(). */ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta, true); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get(sdata, addr); return __sta_info_destroy(sta); } int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, addr); return __sta_info_destroy(sta); } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = timer_container_of(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; err = rhltable_init(&local->link_sta_hash, &link_sta_rht_params); if (err) { rhltable_destroy(&local->sta_hash); return err; } spin_lock_init(&local->tim_lock); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { timer_delete_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id, struct sta_info *do_not_flush_sta) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata != sta->sdata && (!vlans || sdata->bss != sta->sdata->bss)) continue; if (sta == do_not_flush_sta) continue; if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } if (!list_empty(&free_list)) { bool support_p2p_ps = true; synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) { if (!sta->sta.support_p2p_ps) support_p2p_ps = false; __sta_info_destroy_part2(sta, false); } ieee80211_recalc_min_chandef(sdata, -1); if (!support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sdata); } return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. */ for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; if (!sta->uploaded) return NULL; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) { struct sta_info *sta; if (!vif) return NULL; sta = sta_info_get_bss(vif_to_sdata(vif), addr); if (!sta) return NULL; if (!sta->uploaded) return NULL; return &sta->sta; } EXPORT_SYMBOL(ieee80211_find_sta); /* powersave support code */ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) continue; schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } ieee80211_add_pending_skbs(local, &pending); /* now we're no longer in the deliver code */ clear_sta_flag(sta, WLAN_STA_PS_DELIVER); /* The station might have polled and then woken up before we responded, * so clear these flags now to avoid them sticking around. */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); local->total_ps_buffered -= buffered; sta_info_recalc_tim(sta); ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, bool call_driver, bool more_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) break; } } static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; unsigned long driver_release_tids = 0; struct sk_buff_head frames; bool more_data; /* Service or PS-Poll period starts */ set_sta_flag(sta, WLAN_STA_SP); __skb_queue_head_init(&frames); ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, &frames, &driver_release_tids); more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid, ac; /* * For PS-Poll, this can only happen due to a race condition * when we set the TIM bit and the station notices it, but * before it can poll for the frame we expire it. * * For uAPSD, this is said in the standard (11.2.1.5 h): * At each unscheduled SP for a non-AP STA, the AP shall * attempt to transmit at least one MSDU or MMPDU, but no * more than the value specified in the Max SP Length field * in the QoS Capability element from delivery-enabled ACs, * that are destined for the non-AP STA. * * Since we have no other MSDU/MMPDU, transmit a QoS null frame. */ /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; bool need_null = false; skb_queue_head_init(&pending); while ((skb = __skb_dequeue(&frames))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qoshdr = NULL; num++; /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; /* * Use MoreData flag to indicate whether there are * more buffered frames for this STA */ if (more_data || !skb_queue_empty(&frames)) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); else hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); tids |= BIT(skb->priority); __skb_queue_tail(&pending, skb); /* end service period after last frame or add one */ if (!skb_queue_empty(&frames)) continue; if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; break; } /* For uAPSD, things are a bit more complicated. If the * last frame has a QoS header (i.e. is a QoS-data or * QoS-nulldata frame) then just set the EOSP bit there * and be done. * If the frame doesn't have a QoS header (which means * it should be a bufferable MMPDU) then we can't set * the EOSP bit in the QoS header; add a QoS-nulldata * frame to the list to send it after the MMPDU. * * Note that this code is only in the mac80211-release * code path, we assume that the driver will not buffer * anything but QoS-data frames, or if it does, will * create the QoS-nulldata frame by itself if needed. * * Cf. 802.11-2012 10.2.1.10 (c). */ if (qoshdr) { *qoshdr |= IEEE80211_QOS_CTL_EOSP; info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; } else { /* The standard isn't completely clear on this * as it says the more-data bit should be set * if there are more BUs. The QoS-Null frame * we're about to send isn't buffered yet, we * only create it below, but let's pretend it * was buffered just in case some clients only * expect more-data=0 when eosp=1. */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); need_null = true; num++; } break; } drv_allow_buffered_frames(local, sta, tids, num, reason, more_data); ieee80211_add_pending_skbs(local, &pending); if (need_null) ieee80211_send_null_response( sta, find_highest_prio_tid(tids), reason, false, false); sta_info_recalc_tim(sta); } else { int tid; /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. * Note that the driver also has to check the number of frames * on the TIDs we're releasing from - if there are more than * n_frames it has to set the more-data bit (if we didn't ask * it to set it anyway due to other buffered frames); if there * are fewer than n_frames it has to make sure to adjust that * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) * became empty or we find that a txq became empty, we'll do the * TIM recalculation. */ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); break; } } } void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) { u8 ignore_for_response = sta->sta.uapsd_queues; /* * If all ACs are delivery-enabled then we should reply * from any of them, if only some are enabled we reply * only from the non-enabled ones. */ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_response = 0; ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response, IEEE80211_FRAME_RELEASE_PSPOLL); } void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta) { int n_frames = sta->sta.max_sp; u8 delivery_enabled = sta->sta.uapsd_queues; /* * If we ever grow support for TSPEC this might happen if * the TSPEC update from hostapd comes in between a trigger * frame setting WLAN_STA_UAPSD in the RX path and this * actually getting called. */ if (!delivery_enabled) return; switch (sta->sta.max_sp) { case 1: n_frames = 2; break; case 2: n_frames = 4; break; case 3: n_frames = 6; break; case 0: /* XXX: what is a good value? */ n_frames = 128; break; } ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled, IEEE80211_FRAME_RELEASE_UAPSD); } void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); trace_api_sta_block_awake(sta->local, pubsta, block); if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_clear_fast_xmit(sta); return; } if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) return; if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { set_sta_flag(sta, WLAN_STA_PS_DELIVER); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || test_sta_flag(sta, WLAN_STA_UAPSD)) { /* must be asleep in this case */ clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; trace_api_eosp(local, pubsta); clear_sta_flag(sta, WLAN_STA_SP); } EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); enum ieee80211_frame_release_type reason; bool more_data; trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); reason = IEEE80211_FRAME_RELEASE_UAPSD; more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, reason, 0); ieee80211_send_null_response(sta, tid, reason, false, more_data); } EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); if (buffered) set_bit(tid, &sta->driver_buffered_tids); else clear_bit(tid, &sta->driver_buffered_tids); sta_info_recalc_tim(sta); } EXPORT_SYMBOL(ieee80211_sta_set_buffered); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->sdata->local; u8 ac = ieee80211_ac_from_tid(tid); u32 airtime = 0; if (sta->local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; if (sta->local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].tx_airtime += tx_airtime; sta->airtime[ac].rx_airtime += rx_airtime; if (ieee80211_sta_keep_active(sta, ac)) sta->airtime[ac].deficit -= airtime; spin_unlock_bh(&local->active_txq_lock[ac]); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { bool first = true; int link_id; if (!sta->sta.valid_links || !sta->sta.mlo) { sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { struct ieee80211_link_sta *link_sta; int i; if (!(active_links & BIT(link_id))) continue; link_sta = rcu_dereference(sta->sta.link[link_id]); if (!link_sta) continue; if (first) { sta->cur = sta->sta.deflink.agg; first = false; continue; } sta->cur.max_amsdu_len = min(sta->cur.max_amsdu_len, link_sta->agg.max_amsdu_len); sta->cur.max_rc_amsdu_len = min(sta->cur.max_rc_amsdu_len, link_sta->agg.max_rc_amsdu_len); for (i = 0; i < ARRAY_SIZE(sta->cur.max_tid_amsdu_len); i++) sta->cur.max_tid_amsdu_len[i] = min(sta->cur.max_tid_amsdu_len[i], link_sta->agg.max_tid_amsdu_len[i]); } rcu_read_unlock(); sta->sta.cur = &sta->cur; } void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } atomic_sub(tx_airtime, &local->aql_total_pending_airtime); tx_pending = atomic_sub_return(tx_airtime, &local->aql_ac_pending_airtime[ac]); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) { atomic_cmpxchg(&local->aql_ac_pending_airtime[ac], tx_pending, 0); atomic_sub(tx_pending, &local->aql_total_pending_airtime); } } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; int cpu; if (!sta->deflink.pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; case STA_STATS_RATE_TYPE_EHT: rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); rinfo->nss = STA_STATS_GET(EHT_NSS, rate); rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, tid); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } if (tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } #ifdef CONFIG_MAC80211_MESH static void sta_set_mesh_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_local *local = sta->sdata->local; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } #endif void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->deflink.u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->deflink.rx_stats.packets; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime_weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->deflink.pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && !sta->sta.valid_links && ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i); } #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) sta_set_mesh_sinfo(sta, sinfo); #endif sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); if (!sta->deflink.status_stats.last_ack || time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) return stats->last_rx; return sta->deflink.status_stats.last_ack; } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct sta_link_alloc *alloc; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); /* must represent an MLD from the start */ if (WARN_ON(!sta->sta.valid_links)) return -EINVAL; if (WARN_ON(sta->sta.valid_links & BIT(link_id) || sta->link[link_id])) return -EBUSY; alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); if (!alloc) return -ENOMEM; ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL); if (ret) { kfree(alloc); return ret; } sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); ieee80211_link_sta_debugfs_add(&alloc->info); return 0; } void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id) { lockdep_assert_wiphy(sta->sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); sta_remove_link(sta, link_id, false); } int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct link_sta_info *link_sta; u16 old_links = sta->sta.valid_links; u16 new_links = old_links | BIT(link_id); int ret; link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sdata->local->hw.wiphy->mtx)); if (WARN_ON(old_links == new_links || !link_sta)) return -EINVAL; rcu_read_lock(); if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) { rcu_read_unlock(); return -EALREADY; } /* we only modify under the mutex so this is fine */ rcu_read_unlock(); sta->sta.valid_links = new_links; if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) goto hash; ieee80211_recalc_min_chandef(sdata, link_id); /* Ensure the values are updated for the driver, * redone by sta_remove_link on failure. */ ieee80211_sta_recalc_aggregates(&sta->sta); ret = drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, new_links); if (ret) { sta->sta.valid_links = old_links; sta_remove_link(sta, link_id, false); return ret; } hash: ret = link_sta_info_hash_add(sdata->local, link_sta); WARN_ON(ret); return 0; } void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; u16 old_links = sta->sta.valid_links; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta->sta.valid_links &= ~BIT(link_id); if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, sta->sta.valid_links); sta_remove_link(sta, link_id, true); } void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len) { u8 val; sta->sta.max_amsdu_subframes = 0; if (ext_capab_len < 8) return; /* The sender might not have sent the last bit, consider it to be 0 */ val = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB); /* we did get all the bits, take the MSB as well */ if (ext_capab_len >= 9) val |= u8_get_bits(ext_capab[8], WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB) << 1; if (val) sta->sta.max_amsdu_subframes = 4 << (4 - val); } #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); return lockdep_is_held(&sta->local->hw.wiphy->mtx); } EXPORT_SYMBOL(lockdep_sta_mutex_held); #endif |
| 178 131 89 203 204 161 161 161 73 73 205 205 73 73 73 6 3 64 16 1 2 14 36 34 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/attributes.c * * Vyacheslav Dubeyko <slava@dubeyko.com> * * Handling of records in attributes tree */ #include "hfsplus_fs.h" #include "hfsplus_raw.h" static struct kmem_cache *hfsplus_attr_tree_cachep; int __init hfsplus_create_attr_tree_cache(void) { if (hfsplus_attr_tree_cachep) return -EEXIST; hfsplus_attr_tree_cachep = kmem_cache_create("hfsplus_attr_cache", sizeof(hfsplus_attr_entry), 0, SLAB_HWCACHE_ALIGN, NULL); if (!hfsplus_attr_tree_cachep) return -ENOMEM; return 0; } void hfsplus_destroy_attr_tree_cache(void) { kmem_cache_destroy(hfsplus_attr_tree_cachep); } int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2) { __be32 k1_cnid, k2_cnid; k1_cnid = k1->attr.cnid; k2_cnid = k2->attr.cnid; if (k1_cnid != k2_cnid) return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1; return hfsplus_strcmp( (const struct hfsplus_unistr *)&k1->attr.key_name, (const struct hfsplus_unistr *)&k2->attr.key_name); } int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, u32 cnid, const char *name) { int len; memset(key, 0, sizeof(struct hfsplus_attr_key)); key->attr.cnid = cpu_to_be32(cnid); if (name) { int res = hfsplus_asc2uni(sb, (struct hfsplus_unistr *)&key->attr.key_name, HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name)); if (res) return res; len = be16_to_cpu(key->attr.key_name.length); } else { key->attr.key_name.length = 0; len = 0; } /* The length of the key, as stored in key_len field, does not include * the size of the key_len field itself. * So, offsetof(hfsplus_attr_key, key_name) is a trick because * it takes into consideration key_len field (__be16) of * hfsplus_attr_key structure instead of length field (__be16) of * hfsplus_attr_unistr structure. */ key->key_len = cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + 2 * len); return 0; } hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) { return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); } void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry) { if (entry) kmem_cache_free(hfsplus_attr_tree_cachep, entry); } #define HFSPLUS_INVALID_ATTR_RECORD -1 static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type, u32 cnid, const void *value, size_t size) { if (record_type == HFSPLUS_ATTR_FORK_DATA) { /* * Mac OS X supports only inline data attributes. * Do nothing */ memset(entry, 0, sizeof(*entry)); return sizeof(struct hfsplus_attr_fork_data); } else if (record_type == HFSPLUS_ATTR_EXTENTS) { /* * Mac OS X supports only inline data attributes. * Do nothing. */ memset(entry, 0, sizeof(*entry)); return sizeof(struct hfsplus_attr_extents); } else if (record_type == HFSPLUS_ATTR_INLINE_DATA) { u16 len; memset(entry, 0, sizeof(struct hfsplus_attr_inline_data)); entry->inline_data.record_type = cpu_to_be32(record_type); if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE) len = size; else return HFSPLUS_INVALID_ATTR_RECORD; entry->inline_data.length = cpu_to_be16(len); memcpy(entry->inline_data.raw_bytes, value, len); /* * Align len on two-byte boundary. * It needs to add pad byte if we have odd len. */ len = round_up(len, 2); return offsetof(struct hfsplus_attr_inline_data, raw_bytes) + len; } else /* invalid input */ memset(entry, 0, sizeof(*entry)); return HFSPLUS_INVALID_ATTR_RECORD; } int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name, struct hfs_find_data *fd) { int err = 0; hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); return -EINVAL; } if (name) { err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name); if (err) goto failed_find_attr; err = hfs_brec_find(fd, hfs_find_rec_by_key); if (err) goto failed_find_attr; } else { err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL); if (err) goto failed_find_attr; err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid); if (err) goto failed_find_attr; } failed_find_attr: return err; } int hfsplus_attr_exists(struct inode *inode, const char *name) { int err = 0; struct super_block *sb = inode->i_sb; struct hfs_find_data fd; if (!HFSPLUS_SB(sb)->attr_tree) return 0; err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); if (err) return 0; err = hfsplus_find_attr(sb, inode->i_ino, name, &fd); if (err) goto attr_not_found; hfs_find_exit(&fd); return 1; attr_not_found: hfs_find_exit(&fd); return 0; } int hfsplus_create_attr(struct inode *inode, const char *name, const void *value, size_t size) { struct super_block *sb = inode->i_sb; struct hfs_find_data fd; hfsplus_attr_entry *entry_ptr; int entry_size; int err; hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); return -EINVAL; } entry_ptr = hfsplus_alloc_attr_entry(); if (!entry_ptr) return -ENOMEM; err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); if (err) goto failed_init_create_attr; /* Fail early and avoid ENOSPC during the btree operation */ err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1); if (err) goto failed_create_attr; if (name) { err = hfsplus_attr_build_key(sb, fd.search_key, inode->i_ino, name); if (err) goto failed_create_attr; } else { err = -EINVAL; goto failed_create_attr; } /* Mac OS X supports only inline data attributes. */ entry_size = hfsplus_attr_build_record(entry_ptr, HFSPLUS_ATTR_INLINE_DATA, inode->i_ino, value, size); if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) { err = -EINVAL; goto failed_create_attr; } err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; goto failed_create_attr; } err = hfs_brec_insert(&fd, entry_ptr, entry_size); if (err) goto failed_create_attr; hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); failed_create_attr: hfs_find_exit(&fd); failed_init_create_attr: hfsplus_destroy_attr_entry(entry_ptr); return err; } static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, struct hfs_find_data *fd) { int err = 0; __be32 found_cnid, record_type; hfs_bnode_read(fd->bnode, &found_cnid, fd->keyoffset + offsetof(struct hfsplus_attr_key, cnid), sizeof(__be32)); if (cnid != be32_to_cpu(found_cnid)) return -ENOENT; hfs_bnode_read(fd->bnode, &record_type, fd->entryoffset, sizeof(record_type)); switch (be32_to_cpu(record_type)) { case HFSPLUS_ATTR_INLINE_DATA: /* All is OK. Do nothing. */ break; case HFSPLUS_ATTR_FORK_DATA: case HFSPLUS_ATTR_EXTENTS: pr_err("only inline data xattr are supported\n"); return -EOPNOTSUPP; default: pr_err("invalid extended attribute record\n"); return -ENOENT; } /* Avoid btree corruption */ hfs_bnode_read(fd->bnode, fd->search_key, fd->keyoffset, fd->keylength); err = hfs_brec_remove(fd); if (err) return err; hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); return err; } int hfsplus_delete_attr(struct inode *inode, const char *name) { int err = 0; struct super_block *sb = inode->i_sb; struct hfs_find_data fd; hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); return -EINVAL; } err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); if (err) return err; /* Fail early and avoid ENOSPC during the btree operation */ err = hfs_bmap_reserve(fd.tree, fd.tree->depth); if (err) goto out; if (name) { err = hfsplus_attr_build_key(sb, fd.search_key, inode->i_ino, name); if (err) goto out; } else { pr_err("invalid extended attribute name\n"); err = -EINVAL; goto out; } err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; err = __hfsplus_delete_attr(inode, inode->i_ino, &fd); if (err) goto out; out: hfs_find_exit(&fd); return err; } int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) { int err = 0; struct hfs_find_data fd; hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); return -EINVAL; } err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd); if (err) return err; for (;;) { err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); if (err) { if (err != -ENOENT) pr_err("xattr search failed\n"); goto end_delete_all; } err = __hfsplus_delete_attr(dir, cnid, &fd); if (err) goto end_delete_all; } end_delete_all: hfs_find_exit(&fd); return err; } |
| 4 3 1 2 1 1 1 4 2 2 1 1 1 12 1 1 8 2 1 4 2 2 3 2 12 1 1 7 5 5 1 1 3 2 1 2 2 1 2 3 3 1 3 3 3 1 11 2 4 4 4 6 10 7 6 1 9 6 4 4 5 6 4 1 2 1 6 6 2 1 2 1 1 1 1 1 1 1 1 12 1 1 1 9 8 224 226 1 1 1 13 2 1 11 1 3 6 1 1 4 1 1 3 1 5 12 7 6 12 20 1 3 1 1 12 5 7 9 1 7 1 8 1 5 1 3 1 8 1 6 1 4 1 14 1 2 1 6 12 233 1 2 235 226 2 12 1 1 1 1 244 1 16 3 235 236 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 | // SPDX-License-Identifier: GPL-2.0 /* * Code related to the io_uring_register() syscall * * Copyright (C) 2023 Jens Axboe */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/refcount.h> #include <linux/bits.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include "io_uring.h" #include "opdef.h" #include "tctx.h" #include "rsrc.h" #include "sqpoll.h" #include "register.h" #include "cancel.h" #include "kbuf.h" #include "napi.h" #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_probe *p; size_t size; int i, ret; if (nr_args > IORING_OP_LAST) nr_args = IORING_OP_LAST; size = struct_size(p, ops, nr_args); p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; ret = -EFAULT; if (copy_from_user(p, arg, size)) goto out; ret = -EINVAL; if (memchr_inv(p, 0, size)) goto out; p->last_op = IORING_OP_LAST - 1; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; ret = 0; if (copy_to_user(arg, p, size)) ret = -EFAULT; out: kfree(p); return ret; } int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; creds = xa_erase(&ctx->personalities, id); if (creds) { put_cred(creds); return 0; } return -EINVAL; } static int io_register_personality(struct io_ring_ctx *ctx) { const struct cred *creds; u32 id; int ret; creds = get_current_cred(); ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); if (ret < 0) { put_cred(creds); return ret; } return id; } static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, struct io_restriction *restrictions) { struct io_uring_restriction *res; size_t size; int i, ret; if (!arg || nr_args > IORING_MAX_RESTRICTIONS) return -EINVAL; size = array_size(nr_args, sizeof(*res)); if (size == SIZE_MAX) return -EOVERFLOW; res = memdup_user(arg, size); if (IS_ERR(res)) return PTR_ERR(res); ret = -EINVAL; for (i = 0; i < nr_args; i++) { switch (res[i].opcode) { case IORING_RESTRICTION_REGISTER_OP: if (res[i].register_op >= IORING_REGISTER_LAST) goto err; __set_bit(res[i].register_op, restrictions->register_op); break; case IORING_RESTRICTION_SQE_OP: if (res[i].sqe_op >= IORING_OP_LAST) goto err; __set_bit(res[i].sqe_op, restrictions->sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: restrictions->sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: restrictions->sqe_flags_required = res[i].sqe_flags; break; default: goto err; } } ret = 0; err: kfree(res); return ret; } static __cold int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args) { int ret; /* Restrictions allowed only if rings started disabled */ if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; /* We allow only a single restrictions registration */ if (ctx->restrictions.registered) return -EBUSY; ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; return ret; } static int io_register_enable_rings(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); /* * Lazy activation attempts would fail if it was polled before * submitter_task is set. */ if (wq_has_sleeper(&ctx->poll_wq)) io_activate_pollwq(ctx); } if (ctx->restrictions.registered) ctx->restricted = 1; ctx->flags &= ~IORING_SETUP_R_DISABLED; if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); return 0; } static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, cpumask_var_t new_mask) { int ret; if (!(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_wq_cpu_affinity(current->io_uring, new_mask); } else { mutex_unlock(&ctx->uring_lock); ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); mutex_lock(&ctx->uring_lock); } return ret; } static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { cpumask_var_t new_mask; int ret; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; cpumask_clear(new_mask); if (len > cpumask_size()) len = cpumask_size(); #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = compat_get_bitmap(cpumask_bits(new_mask), (const compat_ulong_t __user *)arg, len * 8 /* CHAR_BIT */); else #endif ret = copy_from_user(new_mask, arg, len); if (ret) { free_cpumask_var(new_mask); return -EFAULT; } ret = __io_register_iowq_aff(ctx, new_mask); free_cpumask_var(new_mask); return ret; } static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { return __io_register_iowq_aff(ctx, NULL); } static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_tctx_node *node; struct io_uring_task *tctx = NULL; struct io_sq_data *sqd = NULL; __u32 new_count[2]; int i, ret; if (copy_from_user(new_count, arg, sizeof(new_count))) return -EFAULT; for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i] > INT_MAX) return -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { struct task_struct *tsk; /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. Fine to drop uring_lock here, we hold * a ref to the ctx. */ refcount_inc(&sqd->refs); mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); tsk = sqpoll_task_locked(sqd); if (tsk) tctx = tsk->io_uring; } } else { tctx = current->io_uring; } BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; ctx->iowq_limits_set = true; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); if (ret) goto err; } else { memset(new_count, 0, sizeof(new_count)); } if (sqd) { mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; /* that's it for SQPOLL, only the SQPOLL task creates requests */ if (sqd) return 0; /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; for (i = 0; i < ARRAY_SIZE(new_count); i++) new_count[i] = ctx->iowq_limits[i]; /* ignore errors, it always returns zero anyway */ (void)io_wq_max_workers(tctx->io_wq, new_count); } return 0; err: if (sqd) { mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); mutex_lock(&ctx->uring_lock); } return ret; } static int io_register_clock(struct io_ring_ctx *ctx, struct io_uring_clock_register __user *arg) { struct io_uring_clock_register reg; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; switch (reg.clockid) { case CLOCK_MONOTONIC: ctx->clock_offset = 0; break; case CLOCK_BOOTTIME: ctx->clock_offset = TK_OFFS_BOOT; break; default: return -EINVAL; } ctx->clockid = reg.clockid; return 0; } /* * State to maintain until we can swap. Both new and old state, used for * either mapping or freeing. */ struct io_ring_ctx_rings { struct io_rings *rings; struct io_uring_sqe *sq_sqes; struct io_mapped_region sq_region; struct io_mapped_region ring_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, struct io_uring_params *p, struct io_ring_ctx_rings *r) { io_free_region(ctx, &r->sq_region); io_free_region(ctx, &r->ring_region); } #define swap_old(ctx, o, n, field) \ do { \ (o).field = (ctx)->field; \ (ctx)->field = (n).field; \ } while (0) #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; unsigned i, tail, old_head; struct io_uring_params p; int ret; /* for single issuer, must be owner resizing */ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && current != ctx->submitter_task) return -EEXIST; /* limited to DEFER_TASKRUN for now */ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags & ~RESIZE_FLAGS) return -EINVAL; /* properties that are always inherited */ p.flags |= (ctx->flags & COPY_FLAGS); ret = io_uring_fill_params(p.sq_entries, &p); if (unlikely(ret)) return ret; /* nothing to do, but copy params back */ if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { if (copy_to_user(arg, &p, sizeof(p))) return -EFAULT; return 0; } size = rings_size(p.flags, p.sq_entries, p.cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(size); if (p.flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p.cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); if (ret) { io_register_free_rings(ctx, &p, &n); return ret; } n.rings = io_region_get_ptr(&n.ring_region); /* * At this point n.rings is shared with userspace, just like o.rings * is as well. While we don't expect userspace to modify it while * a resize is in progress, and it's most likely that userspace will * shoot itself in the foot if it does, we can't always assume good * intent... Use read/write once helpers from here on to indicate the * shared nature of it. */ WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); if (copy_to_user(arg, &p, sizeof(p))) { io_register_free_rings(ctx, &p, &n); return -EFAULT; } if (p.flags & IORING_SETUP_SQE128) size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); else size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); if (size == SIZE_MAX) { io_register_free_rings(ctx, &p, &n); return -EOVERFLOW; } memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(size); if (p.flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p.sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); if (ret) { io_register_free_rings(ctx, &p, &n); return ret; } n.sq_sqes = io_region_get_ptr(&n.sq_region); /* * If using SQPOLL, park the thread */ if (ctx->sq_data) { mutex_unlock(&ctx->uring_lock); io_sq_thread_park(ctx->sq_data); mutex_lock(&ctx->uring_lock); } /* * We'll do the swap. Grab the ctx->mmap_lock, which will exclude * any new mmap's on the ring fd. Clear out existing mappings to prevent * mmap from seeing them, as we'll unmap them. Any attempt to mmap * existing rings beyond this point will fail. Not that it could proceed * at this point anyway, as the io_uring mmap side needs go grab the * ctx->mmap_lock as well. Likewise, hold the completion lock over the * duration of the actual swap. */ mutex_lock(&ctx->mmap_lock); spin_lock(&ctx->completion_lock); o.rings = ctx->rings; ctx->rings = NULL; o.sq_sqes = ctx->sq_sqes; ctx->sq_sqes = NULL; /* * Now copy SQ and CQ entries, if any. If either of the destination * rings can't hold what is already there, then fail the operation. */ tail = READ_ONCE(o.rings->sq.tail); old_head = READ_ONCE(o.rings->sq.head); if (tail - old_head > p.sq_entries) goto overflow; for (i = old_head; i < tail; i++) { unsigned src_head = i & (ctx->sq_entries - 1); unsigned dst_head = i & (p.sq_entries - 1); n.sq_sqes[dst_head] = o.sq_sqes[src_head]; } WRITE_ONCE(n.rings->sq.head, old_head); WRITE_ONCE(n.rings->sq.tail, tail); tail = READ_ONCE(o.rings->cq.tail); old_head = READ_ONCE(o.rings->cq.head); if (tail - old_head > p.cq_entries) { overflow: /* restore old rings, and return -EOVERFLOW via cleanup path */ ctx->rings = o.rings; ctx->sq_sqes = o.sq_sqes; to_free = &n; ret = -EOVERFLOW; goto out; } for (i = old_head; i < tail; i++) { unsigned src_head = i & (ctx->cq_entries - 1); unsigned dst_head = i & (p.cq_entries - 1); n.rings->cqes[dst_head] = o.rings->cqes[src_head]; } WRITE_ONCE(n.rings->cq.head, old_head); WRITE_ONCE(n.rings->cq.tail, tail); /* invalidate cached cqe refill */ ctx->cqe_cached = ctx->cqe_sentinel = NULL; WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped)); atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags)); WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags)); WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); /* all done, store old pointers and assign new ones */ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); ctx->sq_entries = p.sq_entries; ctx->cq_entries = p.cq_entries; ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); io_register_free_rings(ctx, &p, to_free); if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); return ret; } static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) { struct io_uring_mem_region_reg __user *reg_uptr = uarg; struct io_uring_mem_region_reg reg; struct io_uring_region_desc __user *rd_uptr; struct io_uring_region_desc rd; int ret; if (io_region_is_set(&ctx->param_region)) return -EBUSY; if (copy_from_user(®, reg_uptr, sizeof(reg))) return -EFAULT; rd_uptr = u64_to_user_ptr(reg.region_uptr); if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) return -EINVAL; /* * This ensures there are no waiters. Waiters are unlocked and it's * hard to synchronise with them, especially if we need to initialise * the region. */ if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) && !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { io_free_region(ctx, &ctx->param_region); return -EFAULT; } if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); ctx->cq_wait_size = rd.size; } return 0; } static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) __acquires(ctx->uring_lock) { int ret; /* * We don't quiesce the refs for register anymore and so it can't be * dying as we're holding a file ref here. */ if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; if (ctx->restricted) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; } switch (opcode) { case IORING_REGISTER_BUFFERS: ret = -EFAULT; if (!arg) break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: ret = -EINVAL; if (arg || nr_args) break; ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: ret = -EFAULT; if (!arg) break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: ret = -EINVAL; if (arg || nr_args) break; ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg, 0); break; case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg, 1); break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; if (arg || nr_args) break; ret = io_eventfd_unregister(ctx); break; case IORING_REGISTER_PROBE: ret = -EINVAL; if (!arg || nr_args > 256) break; ret = io_probe(ctx, arg, nr_args); break; case IORING_REGISTER_PERSONALITY: ret = -EINVAL; if (arg || nr_args) break; ret = io_register_personality(ctx); break; case IORING_UNREGISTER_PERSONALITY: ret = -EINVAL; if (arg) break; ret = io_unregister_personality(ctx, nr_args); break; case IORING_REGISTER_ENABLE_RINGS: ret = -EINVAL; if (arg || nr_args) break; ret = io_register_enable_rings(ctx); break; case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; case IORING_REGISTER_FILES2: ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); break; case IORING_REGISTER_FILES_UPDATE2: ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_FILE); break; case IORING_REGISTER_BUFFERS2: ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; case IORING_REGISTER_BUFFERS_UPDATE: ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; case IORING_REGISTER_IOWQ_AFF: ret = -EINVAL; if (!arg || !nr_args) break; ret = io_register_iowq_aff(ctx, arg, nr_args); break; case IORING_UNREGISTER_IOWQ_AFF: ret = -EINVAL; if (arg || nr_args) break; ret = io_unregister_iowq_aff(ctx); break; case IORING_REGISTER_IOWQ_MAX_WORKERS: ret = -EINVAL; if (!arg || nr_args != 2) break; ret = io_register_iowq_max_workers(ctx, arg); break; case IORING_REGISTER_RING_FDS: ret = io_ringfd_register(ctx, arg, nr_args); break; case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; case IORING_REGISTER_PBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_pbuf_ring(ctx, arg); break; case IORING_UNREGISTER_PBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_unregister_pbuf_ring(ctx, arg); break; case IORING_REGISTER_SYNC_CANCEL: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_sync_cancel(ctx, arg); break; case IORING_REGISTER_FILE_ALLOC_RANGE: ret = -EINVAL; if (!arg || nr_args) break; ret = io_register_file_alloc_range(ctx, arg); break; case IORING_REGISTER_PBUF_STATUS: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_pbuf_status(ctx, arg); break; case IORING_REGISTER_NAPI: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_napi(ctx, arg); break; case IORING_UNREGISTER_NAPI: ret = -EINVAL; if (nr_args != 1) break; ret = io_unregister_napi(ctx, arg); break; case IORING_REGISTER_CLOCK: ret = -EINVAL; if (!arg || nr_args) break; ret = io_register_clock(ctx, arg); break; case IORING_REGISTER_CLONE_BUFFERS: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_clone_buffers(ctx, arg); break; case IORING_REGISTER_ZCRX_IFQ: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_zcrx_ifq(ctx, arg); break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_resize_rings(ctx, arg); break; case IORING_REGISTER_MEM_REGION: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_mem_region(ctx, arg); break; default: ret = -EINVAL; break; } return ret; } /* * Given an 'fd' value, return the ctx associated with if. If 'registered' is * true, then the registered index is used. Otherwise, the normal fd table. * Caller must call fput() on the returned file, unless it's an ERR_PTR. */ struct file *io_uring_register_get_file(unsigned int fd, bool registered) { struct file *file; if (registered) { /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. */ struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; if (file) get_file(file); } else { file = fget(fd); } if (unlikely(!file)) return ERR_PTR(-EBADF); if (io_is_uring_fops(file)) return file; fput(file); return ERR_PTR(-EOPNOTSUPP); } /* * "blind" registration opcodes are ones where there's no ring given, and * hence the source fd must be -1. */ static int io_uring_register_blind(unsigned int opcode, void __user *arg, unsigned int nr_args) { switch (opcode) { case IORING_REGISTER_SEND_MSG_RING: { struct io_uring_sqe sqe; if (!arg || nr_args != 1) return -EINVAL; if (copy_from_user(&sqe, arg, sizeof(sqe))) return -EFAULT; /* no flags supported */ if (sqe.flags) return -EINVAL; if (sqe.opcode == IORING_OP_MSG_RING) return io_uring_sync_msg_ring(&sqe); } } return -EINVAL; } SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, void __user *, arg, unsigned int, nr_args) { struct io_ring_ctx *ctx; long ret = -EBADF; struct file *file; bool use_registered_ring; use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; if (opcode >= IORING_REGISTER_LAST) return -EINVAL; if (fd == -1) return io_uring_register_blind(opcode, arg, nr_args); file = io_uring_register_get_file(fd, use_registered_ring); if (IS_ERR(file)) return PTR_ERR(file); ctx = file->private_data; mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); fput(file); return ret; } |
| 269 270 270 270 269 11 11 11 270 270 12 2 11 2 11 12 11 12 11 11 11 11 11 11 2 2 2 2 2 2 268 268 268 11 11 11 269 269 270 270 269 11 11 11 11 269 269 270 270 270 270 270 270 270 270 2 3 25 1 25 13 13 11 2 21 21 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/bug.h> #include <linux/list.h> #include <crypto/hash.h> #include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" #include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" #include "space-info.h" #include "block-group.h" #include "qgroup.h" #include "misc.h" #include "fs.h" #include "accessors.h" /* * Structure name Path * -------------------------------------------------------------------------- * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features * btrfs_supported_feature_attrs /sys/fs/btrfs/features and * /sys/fs/btrfs/<uuid>/features * btrfs_attrs /sys/fs/btrfs/<uuid> * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid> * allocation_attrs /sys/fs/btrfs/<uuid>/allocation * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> * discard_attrs /sys/fs/btrfs/<uuid>/discard * * When built with BTRFS_CONFIG_DEBUG: * * btrfs_debug_feature_attrs /sys/fs/btrfs/debug * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug */ struct btrfs_feature_attr { struct kobj_attribute kobj_attr; enum btrfs_feature_set feature_set; u64 feature_bit; }; /* For raid type sysfs entries */ struct raid_kobject { u64 flags; struct kobject kobj; }; #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define BTRFS_ATTR_W(_prefix, _name, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define BTRFS_ATTR(_prefix, _name, _show) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #define BTRFS_ATTR_PTR(_prefix, _name) \ (&btrfs_attr_##_prefix##_##_name.attr) #define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ btrfs_feature_attr_show, \ btrfs_feature_attr_store), \ .feature_set = _feature_set, \ .feature_bit = _feature_prefix ##_## _feature_bit, \ } #define BTRFS_FEAT_ATTR_PTR(_name) \ (&btrfs_attr_features_##_name.kobj_attr.attr) #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { return container_of(a, struct btrfs_feature_attr, kobj_attr); } static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) { return container_of(attr, struct kobj_attribute, attr); } static struct btrfs_feature_attr *attr_to_btrfs_feature_attr( struct attribute *attr) { return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); } static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) { struct btrfs_super_block *disk_super = fs_info->super_copy; if (set == FEAT_COMPAT) return btrfs_super_compat_flags(disk_super); else if (set == FEAT_COMPAT_RO) return btrfs_super_compat_ro_flags(disk_super); else return btrfs_super_incompat_flags(disk_super); } static void set_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 features) { struct btrfs_super_block *disk_super = fs_info->super_copy; if (set == FEAT_COMPAT) btrfs_set_super_compat_flags(disk_super, features); else if (set == FEAT_COMPAT_RO) btrfs_set_super_compat_ro_flags(disk_super, features); else btrfs_set_super_incompat_flags(disk_super, features); } static int can_modify_feature(struct btrfs_feature_attr *fa) { int val = 0; u64 set, clear; switch (fa->feature_set) { case FEAT_COMPAT: set = BTRFS_FEATURE_COMPAT_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; break; case FEAT_COMPAT_RO: set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; break; case FEAT_INCOMPAT: set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; break; default: pr_warn("btrfs: sysfs: unknown feature set %d\n", fa->feature_set); return 0; } if (set & fa->feature_bit) val |= 1; if (clear & fa->feature_bit) val |= 2; return val; } static ssize_t btrfs_feature_attr_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val = 0; struct btrfs_fs_info *fs_info = to_fs_info(kobj); struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); if (fs_info) { u64 features = get_features(fs_info, fa->feature_set); if (features & fa->feature_bit) val = 1; } else val = can_modify_feature(fa); return sysfs_emit(buf, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t count) { struct btrfs_fs_info *fs_info; struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); u64 features, set, clear; unsigned long val; int ret; fs_info = to_fs_info(kobj); if (!fs_info) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; if (fa->feature_set == FEAT_COMPAT) { set = BTRFS_FEATURE_COMPAT_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; } else if (fa->feature_set == FEAT_COMPAT_RO) { set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; } else { set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; } features = get_features(fs_info, fa->feature_set); /* Nothing to do */ if ((val && (features & fa->feature_bit)) || (!val && !(features & fa->feature_bit))) return count; if ((val && !(set & fa->feature_bit)) || (!val && !(clear & fa->feature_bit))) { btrfs_info(fs_info, "%sabling feature %s on mounted fs is not supported.", val ? "En" : "Dis", fa->kobj_attr.attr.name); return -EPERM; } btrfs_info(fs_info, "%s %s feature flag", val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); spin_lock(&fs_info->super_lock); features = get_features(fs_info, fa->feature_set); if (val) features |= fa->feature_bit; else features &= ~fa->feature_bit; set_features(fs_info, fa->feature_set, features); spin_unlock(&fs_info->super_lock); /* * We don't want to do full transaction commit from inside sysfs */ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; } static umode_t btrfs_feature_visible(struct kobject *kobj, struct attribute *attr, int unused) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); umode_t mode = attr->mode; if (fs_info) { struct btrfs_feature_attr *fa; u64 features; fa = attr_to_btrfs_feature_attr(attr); features = get_features(fs_info, fa->feature_set); if (can_modify_feature(fa)) mode |= S_IWUSR; else if (!(features & fa->feature_bit)) mode = 0; } return mode; } BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); #endif /* * Features which depend on feature bits and may differ between each fs. * * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), BTRFS_FEAT_ATTR_PTR(block_group_tree), BTRFS_FEAT_ATTR_PTR(simple_quota), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), #endif NULL }; static const struct attribute_group btrfs_feature_attr_group = { .name = "features", .is_visible = btrfs_feature_visible, .attrs = btrfs_supported_feature_attrs, }; static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); static ssize_t supported_checksums_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { ssize_t ret = 0; int i; for (i = 0; i < btrfs_get_num_csums(); i++) { /* * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "), btrfs_super_csum_name(i)); } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); static ssize_t send_stream_version_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION); } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); static const char *rescue_opts[] = { "usebackuproot", "nologreplay", "ignorebadroots", "ignoredatacsums", "ignoremetacsums", "ignoresuperflags", "all", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { ssize_t ret = 0; int i; for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]); ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_rescue_options, supported_rescue_options_show); static ssize_t supported_sectorsizes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { ssize_t ret = 0; if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; } BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); static ssize_t temp_fsid_supported_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show); /* * Features which only depend on kernel version. * * These are listed in /sys/fs/btrfs/features along with * btrfs_supported_feature_attrs. */ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, acl), BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; static const struct attribute_group btrfs_static_feature_attr_group = { .name = "features", .attrs = btrfs_supported_static_feature_attrs, }; /* * Discard statistics and tunables */ #define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%u\n", READ_ONCE(fs_info->discard_ctl.iops_limit)); } static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u32 iops_limit; int ret; ret = kstrtou32(buf, 10, &iops_limit); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->iops_limit, iops_limit); btrfs_discard_calc_delay(discard_ctl); btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, btrfs_discard_iops_limit_store); static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%u\n", READ_ONCE(fs_info->discard_ctl.kbps_limit)); } static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u32 kbps_limit; int ret; ret = kstrtou32(buf, 10, &kbps_limit); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, btrfs_discard_kbps_limit_store); static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(fs_info->discard_ctl.max_discard_size)); } static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; u64 max_discard_size; int ret; ret = kstrtou64(buf, 10, &max_discard_size); if (ret) return -EINVAL; WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size); return len; } BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); /* * Per-filesystem stats for discard (when mounted with discard=async). * * Path: /sys/fs/btrfs/<uuid>/discard/ */ static const struct attribute *discard_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), BTRFS_ATTR_PTR(discard, discard_bytes_saved), BTRFS_ATTR_PTR(discard, discard_extent_bytes), BTRFS_ATTR_PTR(discard, iops_limit), BTRFS_ATTR_PTR(discard, kbps_limit), BTRFS_ATTR_PTR(discard, max_discard_size), NULL, }; #ifdef CONFIG_BTRFS_DEBUG /* * Per-filesystem runtime debugging exported via sysfs. * * Path: /sys/fs/btrfs/UUID/debug/ */ static const struct attribute *btrfs_debug_mount_attrs[] = { NULL, }; /* * Runtime debugging exported via sysfs, applies to all mounted filesystems. * * Path: /sys/fs/btrfs/debug */ static struct attribute *btrfs_debug_feature_attrs[] = { NULL }; static const struct attribute_group btrfs_debug_feature_attr_group = { .name = "debug", .attrs = btrfs_debug_feature_attrs, }; #endif static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; if (lock) spin_lock(lock); val = *value_ptr; if (lock) spin_unlock(lock); return sysfs_emit(buf, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); BTRFS_ATTR(raid, total_bytes, raid_bytes_show); BTRFS_ATTR(raid, used_bytes, raid_bytes_show); static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group *block_group; int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); u64 val = 0; down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) val += block_group->length; else val += block_group->used; } up_read(&sinfo->groups_sem); return sysfs_emit(buf, "%llu\n", val); } /* * Allocation information about block group profiles. * * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/ */ static struct attribute *raid_attrs[] = { BTRFS_ATTR_PTR(raid, total_bytes), BTRFS_ATTR_PTR(raid, used_bytes), NULL }; ATTRIBUTE_GROUPS(raid); static void release_raid_kobj(struct kobject *kobj) { kfree(to_raid_kobj(kobj)); } static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, }; #define SPACE_INFO_ATTR(field) \ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) static ssize_t btrfs_chunk_size_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *sinfo = to_space_info(kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); } /* * Store new chunk size in space info. Can be called on a read-only filesystem. * * If the new chunk size value is larger than 10% of free space it is reduced * to match that limit. Alignment must be to 256M and the system chunk size * cannot be set. */ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); char *retptr; u64 val; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!fs_info->fs_devices) return -EINVAL; if (btrfs_is_zoned(fs_info)) return -EINVAL; /* System block type must not be changed. */ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) return -EPERM; val = memparse(buf, &retptr); /* There could be trailing '\n', also catch any typos after the value */ retptr = skip_spaces(retptr); if (*retptr != 0 || val == 0) return -EINVAL; val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. */ val &= ~((u64)SZ_256M - 1); /* Must be at least 256M. */ if (val < SZ_256M) return -EINVAL; btrfs_update_space_info_chunk_size(space_info, val); return len; } static ssize_t btrfs_size_classes_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *sinfo = to_space_info(kobj); struct btrfs_block_group *bg; u32 none = 0; u32 small = 0; u32 medium = 0; u32 large = 0; for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) { down_read(&sinfo->groups_sem); list_for_each_entry(bg, &sinfo->block_groups[i], list) { if (!btrfs_block_group_should_use_size_class(bg)) continue; switch (bg->size_class) { case BTRFS_BG_SZ_NONE: none++; break; case BTRFS_BG_SZ_SMALL: small++; break; case BTRFS_BG_SZ_MEDIUM: medium++; break; case BTRFS_BG_SZ_LARGE: large++; break; } } up_read(&sinfo->groups_sem); } return sysfs_emit(buf, "none %u\n" "small %u\n" "medium %u\n" "large %u\n", none, small, medium, large); } #ifdef CONFIG_BTRFS_DEBUG /* * Request chunk allocation with current chunk size. */ static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); struct btrfs_trans_handle *trans; bool val; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; ret = kstrtobool(buf, &val); if (ret) return ret; if (!val) return -EINVAL; /* * This is unsafe to be called from sysfs context and may cause * unexpected problems. */ trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_force_chunk_alloc(trans, space_info->flags); btrfs_end_transaction(trans); if (ret == 1) return len; return -ENOSPC; } BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); #endif SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); SPACE_INFO_ATTR(reclaim_count); SPACE_INFO_ATTR(reclaim_bytes); SPACE_INFO_ATTR(reclaim_errors); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); ssize_t ret; spin_lock(&space_info->lock); ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); spin_unlock(&space_info->lock); return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int thresh; int ret; if (READ_ONCE(space_info->dynamic_reclaim)) return -EINVAL; ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; if (thresh < 0 || thresh > 100) return -EINVAL; WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); } static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int dynamic_reclaim; int ret; ret = kstrtoint(buf, 10, &dynamic_reclaim); if (ret) return ret; if (dynamic_reclaim < 0) return -EINVAL; WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); return len; } BTRFS_ATTR_RW(space_info, dynamic_reclaim, btrfs_sinfo_dynamic_reclaim_show, btrfs_sinfo_dynamic_reclaim_store); static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); } static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_space_info *space_info = to_space_info(kobj); int periodic_reclaim; int ret; ret = kstrtoint(buf, 10, &periodic_reclaim); if (ret) return ret; if (periodic_reclaim < 0) return -EINVAL; WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); return len; } BTRFS_ATTR_RW(space_info, periodic_reclaim, btrfs_sinfo_periodic_reclaim_show, btrfs_sinfo_periodic_reclaim_store); /* * Allocation information about block group types. * * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/ */ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), BTRFS_ATTR_PTR(space_info, total_bytes), BTRFS_ATTR_PTR(space_info, bytes_used), BTRFS_ATTR_PTR(space_info, bytes_pinned), BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), BTRFS_ATTR_PTR(space_info, bytes_readonly), BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), BTRFS_ATTR_PTR(space_info, reclaim_count), BTRFS_ATTR_PTR(space_info, reclaim_bytes), BTRFS_ATTR_PTR(space_info, reclaim_errors), BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif NULL, }; ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { struct btrfs_space_info *sinfo = to_space_info(kobj); kfree(sinfo); } static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, }; /* * Allocation information about block groups. * * Path: /sys/fs/btrfs/<uuid>/allocation/ */ static const struct attribute *allocation_attrs[] = { BTRFS_ATTR_PTR(allocation, global_rsv_reserved), BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; static ssize_t btrfs_label_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; ssize_t ret; spin_lock(&fs_info->super_lock); ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; if (!fs_info) return -EPERM; if (sb_rdonly(fs_info->sb)) return -EROFS; /* * p_len is the len until the first occurrence of either * '\n' or '\0' */ p_len = strcspn(buf, "\n"); if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; spin_lock(&fs_info->super_lock); memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); memcpy(fs_info->super_copy->label, buf, p_len); spin_unlock(&fs_info->super_lock); /* * We don't want to do full transaction commit from inside sysfs */ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; } BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_commit_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "commits %llu\n" "last_commit_ms %llu\n" "max_commit_ms %llu\n" "total_commit_ms %llu\n", fs_info->commit_stats.commit_count, div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); } static ssize_t btrfs_commit_stats_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long val; int ret; if (!fs_info) return -EPERM; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; ret = kstrtoul(buf, 10, &val); if (ret) return ret; if (val) return -EINVAL; WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); return len; } BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); return sysfs_emit(buf, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long knob; int err; if (!fs_info) return -EPERM; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; if (knob) set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); else clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); return len; } BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); } BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); static ssize_t btrfs_checksum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); return sysfs_emit(buf, "%s (%s)\n", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(fs_info->csum_shash)); } BTRFS_ATTR(, checksum, btrfs_checksum_show); static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); const char *str; switch (READ_ONCE(fs_info->exclusive_operation)) { case BTRFS_EXCLOP_NONE: str = "none\n"; break; case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; case BTRFS_EXCLOP_BALANCE_PAUSED: str = "balance paused\n"; break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; case BTRFS_EXCLOP_DEV_REMOVE: str = "device remove\n"; break; case BTRFS_EXCLOP_DEV_REPLACE: str = "device replace\n"; break; case BTRFS_EXCLOP_RESIZE: str = "resize\n"; break; case BTRFS_EXCLOP_SWAP_ACTIVATE: str = "swap activate\n"; break; default: str = "UNKNOWN\n"; break; } return sysfs_emit(buf, "%s", str); } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); static ssize_t btrfs_generation_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_EXPERIMENTAL "round-robin", "devid", #endif }; #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Global module configuration parameters. */ static char *read_policy; char *btrfs_get_mod_read_policy(void) { return read_policy; } /* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ module_param(read_policy, charp, 0); MODULE_PARM_DESC(read_policy, "Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); #endif int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32]; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; strscpy(param, str); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; *value_ret = memparse(value_str, &retptr); /* There could be any trailing typos after the value. */ retptr = skip_spaces(retptr); if (*retptr != 0 || *value_ret <= 0) return -EINVAL; } #endif return sysfs_match_string(btrfs_read_policy_name, param); } #ifdef CONFIG_BTRFS_EXPERIMENTAL int __init btrfs_read_policy_init(void) { s64 value; if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { btrfs_err(NULL, "invalid read policy or value %s", read_policy); return -EINVAL; } return 0; } #endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (ret != 0) ret += sysfs_emit_at(buf, ret, " "); if (i == policy) ret += sysfs_emit_at(buf, ret, "["); ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); #ifdef CONFIG_BTRFS_EXPERIMENTAL if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); if (i == BTRFS_READ_POLICY_DEVID) ret += sysfs_emit_at(buf, ret, ":%llu", READ_ONCE(fs_devices->read_devid)); #endif if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } static ssize_t btrfs_read_policy_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int index; s64 value = -1; index = btrfs_read_policy_to_enum(buf, &value); if (index < 0) return -EINVAL; #ifdef CONFIG_BTRFS_EXPERIMENTAL /* If moving from RR then disable collecting fs stats. */ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = false; if (index == BTRFS_READ_POLICY_RR) { if (value != -1) { const u32 sectorsize = fs_devices->fs_info->sectorsize; if (!IS_ALIGNED(value, sectorsize)) { u64 temp_value = round_up(value, sectorsize); btrfs_debug(fs_devices->fs_info, "read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", value, sectorsize, temp_value); value = temp_value; } } else { value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; } if (index != READ_ONCE(fs_devices->read_policy) || value != READ_ONCE(fs_devices->rr_min_contig_read)) { WRITE_ONCE(fs_devices->read_policy, index); WRITE_ONCE(fs_devices->rr_min_contig_read, value); btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", btrfs_read_policy_name[index], value); } fs_devices->collect_fs_stats = true; return len; } if (index == BTRFS_READ_POLICY_DEVID) { if (value != -1) { BTRFS_DEV_LOOKUP_ARGS(args); /* Validate input devid. */ args.devid = value; if (btrfs_find_device(fs_devices, &args) == NULL) return -EINVAL; } else { /* Set default devid to the devid of the latest device. */ value = fs_devices->latest_dev->devid; } if (index != READ_ONCE(fs_devices->read_policy) || value != READ_ONCE(fs_devices->read_devid)) { WRITE_ONCE(fs_devices->read_policy, index); WRITE_ONCE(fs_devices->read_devid, value); btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", btrfs_read_policy_name[index], value); } return len; } #endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[index]); } return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); } static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); int thresh; int ret; ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; #ifdef CONFIG_BTRFS_DEBUG if (thresh != 0 && (thresh > 100)) return -EINVAL; #else if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; #endif WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); #ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); switch (READ_ONCE(fs_devices->offload_csum_mode)) { case BTRFS_OFFLOAD_CSUM_AUTO: return sysfs_emit(buf, "auto\n"); case BTRFS_OFFLOAD_CSUM_FORCE_ON: return sysfs_emit(buf, "1\n"); case BTRFS_OFFLOAD_CSUM_FORCE_OFF: return sysfs_emit(buf, "0\n"); default: WARN_ON(1); return -EINVAL; } } static ssize_t btrfs_offload_csum_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int ret; bool val; ret = kstrtobool(buf, &val); if (ret == 0) WRITE_ONCE(fs_devices->offload_csum_mode, val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); else if (ret == -EINVAL && sysfs_streq(buf, "auto")) WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); else return -EINVAL; return len; } BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); #endif /* * Per-filesystem information and stats. * * Path: /sys/fs/btrfs/<uuid>/ */ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, }; static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; return to_fs_devs(kobj)->fs_info; } static struct kobject *get_btrfs_kobj(struct kobject *kobj) { while (kobj) { if (kobj->ktype == &btrfs_ktype) return kobj; kobj = kobj->parent; } return NULL; } #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == ARRAY_SIZE(btrfs_feature_attrs)); static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == ARRAY_SIZE(btrfs_feature_attrs[0])); static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, }; static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) { int set; for (set = 0; set < FEAT_MAX; set++) { int i; struct attribute *attrs[2]; struct attribute_group agroup = { .name = "features", .attrs = attrs, }; u64 features = get_features(fs_info, set); features &= ~supported_feature_masks[set]; if (!features) continue; attrs[1] = NULL; for (i = 0; i < NUM_FEATURE_BITS; i++) { struct btrfs_feature_attr *fa; if (!(features & (1ULL << i))) continue; fa = &btrfs_feature_attrs[set][i]; attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } } return 0; } static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { if (fs_devs->devinfo_kobj) { kobject_del(fs_devs->devinfo_kobj); kobject_put(fs_devs->devinfo_kobj); fs_devs->devinfo_kobj = NULL; } if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); fs_devs->devices_kobj = NULL; } if (fs_devs->fsid_kobj.state_initialized) { kobject_del(&fs_devs->fsid_kobj); kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } /* when fs_devs is NULL it will remove all fsid kobject */ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { struct list_head *fs_uuids = btrfs_get_fs_uuids(); if (fs_devs) { __btrfs_sysfs_remove_fsid(fs_devs); return; } list_for_each_entry(fs_devs, fs_uuids, fs_list) { __btrfs_sysfs_remove_fsid(fs_devs); } } static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) btrfs_sysfs_remove_device(device); } } void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } if (fs_info->discard_kobj) { sysfs_remove_files(fs_info->discard_kobj, discard_attrs); kobject_del(fs_info->discard_kobj); kobject_put(fs_info->discard_kobj); } #ifdef CONFIG_BTRFS_DEBUG if (fs_info->debug_kobj) { sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); kobject_del(fs_info->debug_kobj); kobject_put(fs_info->debug_kobj); } #endif addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ int len = 0; int i; char *str; str = kmalloc(bufsize, GFP_KERNEL); if (!str) return str; for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { const char *name; if (!(flags & (1ULL << i))) continue; name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; len += scnprintf(str + len, bufsize - len, "%s%s", len ? "," : "", name); } return str; } static void init_feature_attrs(void) { struct btrfs_feature_attr *fa; int set, i; memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); for (i = 0; btrfs_supported_feature_attrs[i]; i++) { struct btrfs_feature_attr *sfa; struct attribute *a = btrfs_supported_feature_attrs[i]; int bit; sfa = attr_to_btrfs_feature_attr(a); bit = ilog2(sfa->feature_bit); fa = &btrfs_feature_attrs[sfa->feature_set][bit]; fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; } for (set = 0; set < FEAT_MAX; set++) { for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { char *name = btrfs_unknown_feature_names[set][i]; fa = &btrfs_feature_attrs[set][i]; if (fa->kobj_attr.attr.name) continue; snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u", btrfs_feature_set_names[set], i); fa->kobj_attr.attr.name = name; fa->kobj_attr.attr.mode = S_IRUGO; fa->feature_set = set; fa->feature_bit = 1ULL << i; } } } /* * Create a sysfs entry for a given block group type at path * /sys/fs/btrfs/UUID/allocation/data/TYPE */ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_space_info *space_info = cache->space_info; struct raid_kobject *rkobj; const int index = btrfs_bg_flags_to_raid_index(cache->flags); unsigned int nofs_flag; int ret; /* * Setup a NOFS context because kobject_add(), deep in its call chain, * does GFP_KERNEL allocations, and we are often called in a context * where if reclaim is triggered we can deadlock (we are either holding * a transaction handle or some lock required for a transaction * commit). */ nofs_flag = memalloc_nofs_save(); rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); if (!rkobj) { memalloc_nofs_restore(nofs_flag); btrfs_warn(cache->fs_info, "couldn't alloc memory for raid level kobject"); return; } rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); /* * We call this either on mount, or if we've created a block group for a * new index type while running (i.e. when restriping). The running * case is tricky because we could race with other threads, so we need * to have this check to make sure we didn't already init the kobject. * * We don't have to protect on the free side because it only happens on * unmount. */ spin_lock(&space_info->lock); if (space_info->block_group_kobjs[index]) { spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); return; } else { space_info->block_group_kobjs[index] = &rkobj->kobj; } spin_unlock(&space_info->lock); ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { spin_lock(&space_info->lock); space_info->block_group_kobjs[index] = NULL; spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } } /* * Remove sysfs directories for all block group types of a given space info and * the space info as well */ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) { int i; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; kobj = space_info->block_group_kobjs[i]; space_info->block_group_kobjs[i] = NULL; if (kobj) { kobject_del(kobj); kobject_put(kobj); } } kobject_del(&space_info->kobj); kobject_put(&space_info->kobj); } static const char *alloc_name(struct btrfs_space_info *space_info) { u64 flags = space_info->flags; switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: switch (space_info->subgroup_id) { case BTRFS_SUB_GROUP_PRIMARY: return "metadata"; case BTRFS_SUB_GROUP_TREELOG: return "metadata-treelog"; default: WARN_ON_ONCE(1); return "metadata (unknown sub-group)"; } case BTRFS_BLOCK_GROUP_DATA: switch (space_info->subgroup_id) { case BTRFS_SUB_GROUP_PRIMARY: return "data"; case BTRFS_SUB_GROUP_DATA_RELOC: return "data-reloc"; default: WARN_ON_ONCE(1); return "data (unknown sub-group)"; } case BTRFS_BLOCK_GROUP_SYSTEM: ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; default: WARN_ON(1); return "invalid-combination"; } } /* * Create a sysfs entry for a space info type at path * /sys/fs/btrfs/UUID/allocation/TYPE */ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { int ret; ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, fs_info->space_info_kobj, "%s", alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); return ret; } return 0; } void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct kobject *devices_kobj; /* * Seed fs_devices devices_kobj aren't used, fetch kobject from the * fs_info::fs_devices. */ devices_kobj = device->fs_info->fs_devices->devices_kobj; ASSERT(devices_kobj); if (device->bdev) sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); if (device->devid_kobj.state_initialized) { kobject_del(&device->devid_kobj); kobject_put(&device->devid_kobj); wait_for_completion(&device->kobj_unregister); } } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); } static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); char *endptr; unsigned long long limit; limit = memparse(buf, &endptr); /* There could be trailing '\n', also catch any typos after the value. */ endptr = skip_spaces(endptr); if (*endptr != 0) return -EINVAL; WRITE_ONCE(device->scrub_speed_max, limit); return len; } BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, btrfs_devinfo_scrub_speed_max_store); static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); } BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); if (!device->dev_stats_valid) return sysfs_emit(buf, "invalid\n"); /* * Print all at once so we get a snapshot of all values from the same * time. Keep them in sync and in order of definition of * btrfs_dev_stat_values. */ return sysfs_emit(buf, "write_errs %d\n" "read_errs %d\n" "flush_errs %d\n" "corruption_errs %d\n" "generation_errs %d\n", btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); /* * Information about one device. * * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/ */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), NULL }; ATTRIBUTE_GROUPS(devid); static void btrfs_release_devid_kobj(struct kobject *kobj) { struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); memset(&device->devid_kobj, 0, sizeof(struct kobject)); complete(&device->kobj_unregister); } static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, }; int btrfs_sysfs_add_device(struct btrfs_device *device) { int ret; unsigned int nofs_flag; struct kobject *devices_kobj; struct kobject *devinfo_kobj; /* * Make sure we use the fs_info::fs_devices to fetch the kobjects even * for the seed fs_devices */ devices_kobj = device->fs_info->fs_devices->devices_kobj; devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; ASSERT(devices_kobj); ASSERT(devinfo_kobj); nofs_flag = memalloc_nofs_save(); if (device->bdev) { struct kobject *disk_kobj = bdev_kobj(device->bdev); ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); if (ret) { btrfs_warn(device->fs_info, "creating sysfs device link for devid %llu failed: %d", device->devid, ret); goto out; } } init_completion(&device->kobj_unregister); ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, devinfo_kobj, "%llu", device->devid); if (ret) { kobject_put(&device->devid_kobj); btrfs_warn(device->fs_info, "devinfo init for devid %llu failed: %d", device->devid, ret); } out: memalloc_nofs_restore(nofs_flag); return ret; } static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_device *device; struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) goto fail; } list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) goto fail; } } return 0; fail: btrfs_sysfs_remove_fs_devices(fs_devices); return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) { int ret; ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); } void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; /* * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); } void btrfs_sysfs_update_devid(struct btrfs_device *device) { char tmp[24]; snprintf(tmp, sizeof(tmp), "%llu", device->devid); if (kobject_rename(&device->devid_kobj, tmp)) btrfs_warn(device->fs_devices->fs_info, "sysfs: failed to update devid for %llu", device->devid); } /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; /* * Creates: * /sys/fs/btrfs/UUID * * Can be called by the device discovery thread. */ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { int error; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, "%pU", fs_devs->fsid); if (error) { kobject_put(&fs_devs->fsid_kobj); return error; } fs_devs->devices_kobj = kobject_create_and_add("devices", &fs_devs->fsid_kobj); if (!fs_devs->devices_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs device interface"); btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", &fs_devs->fsid_kobj); if (!fs_devs->devinfo_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs devinfo kobject"); btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { btrfs_sysfs_remove_fs_devices(fs_devs); return error; } error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; #ifdef CONFIG_BTRFS_DEBUG fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); if (!fs_info->debug_kobj) { error = -ENOMEM; goto failure; } error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); if (error) goto failure; #endif /* Discard directory */ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); if (!fs_info->discard_kobj) { error = -ENOMEM; goto failure; } error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); if (error) goto failure; error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; } error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); if (error) goto failure; return 0; failure: btrfs_sysfs_remove_mounted(fs_info); return error; } static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); bool enabled; spin_lock(&fs_info->qgroup_lock); enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", enabled); } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); ssize_t ret = 0; spin_lock(&fs_info->qgroup_lock); ASSERT(btrfs_qgroup_enabled(fs_info)); switch (btrfs_qgroup_mode(fs_info)) { case BTRFS_QGROUP_MODE_FULL: ret = sysfs_emit(buf, "qgroup\n"); break; case BTRFS_QGROUP_MODE_SIMPLE: ret = sysfs_emit(buf, "squota\n"); break; default: btrfs_warn(fs_info, "unexpected qgroup mode %d\n", btrfs_qgroup_mode(fs_info)); break; } spin_unlock(&fs_info->qgroup_lock); return ret; } BTRFS_ATTR(qgroups, mode, qgroup_mode_show); static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); bool inconsistent; spin_lock(&fs_info->qgroup_lock); inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", inconsistent); } BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); u8 result; spin_lock(&fs_info->qgroup_lock); result = fs_info->qgroup_drop_subtree_thres; spin_unlock(&fs_info->qgroup_lock); return sysfs_emit(buf, "%d\n", result); } static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, struct kobj_attribute *a, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); u8 new_thres; int ret; ret = kstrtou8(buf, 10, &new_thres); if (ret) return -EINVAL; if (new_thres > BTRFS_MAX_LEVEL) return -EINVAL; spin_lock(&fs_info->qgroup_lock); fs_info->qgroup_drop_subtree_thres = new_thres; spin_unlock(&fs_info->qgroup_lock); return len; } BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, qgroup_drop_subtree_thres_store); /* * Qgroups global info * * Path: /sys/fs/btrfs/<uuid>/qgroups/ */ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); static void qgroups_release(struct kobject *kobj) { kfree(kobj); } static const struct kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, }; static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) { return to_fs_info(kobj->parent->parent); } #define QGROUP_ATTR(_member, _show_name) \ static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ struct btrfs_qgroup, kobj); \ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ } \ BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) #define QGROUP_RSV_ATTR(_name, _type) \ static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ struct kobj_attribute *a, \ char *buf) \ { \ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ struct btrfs_qgroup, kobj); \ return btrfs_show_u64(&qgroup->rsv.values[_type], \ &fs_info->qgroup_lock, buf); \ } \ BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) QGROUP_ATTR(rfer, referenced); QGROUP_ATTR(excl, exclusive); QGROUP_ATTR(max_rfer, max_referenced); QGROUP_ATTR(max_excl, max_exclusive); QGROUP_ATTR(lim_flags, limit_flags); QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); /* * Qgroup information. * * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/ */ static struct attribute *qgroup_attrs[] = { BTRFS_ATTR_PTR(qgroup, referenced), BTRFS_ATTR_PTR(qgroup, exclusive), BTRFS_ATTR_PTR(qgroup, max_referenced), BTRFS_ATTR_PTR(qgroup, max_exclusive), BTRFS_ATTR_PTR(qgroup, limit_flags), BTRFS_ATTR_PTR(qgroup, rsv_data), BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), NULL }; ATTRIBUTE_GROUPS(qgroup); static void qgroup_release(struct kobject *kobj) { struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); memset(&qgroup->kobj, 0, sizeof(*kobj)); } static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, }; int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct kobject *qgroups_kobj = fs_info->qgroups_kobj; int ret; if (btrfs_is_testing(fs_info)) return 0; if (qgroup->kobj.state_initialized) return 0; if (!qgroups_kobj) return -EINVAL; ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid)); if (ret < 0) kobject_put(&qgroup->kobj); return ret; } void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; if (btrfs_is_testing(fs_info)) return; rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) btrfs_sysfs_del_one_qgroup(fs_info, qgroup); if (fs_info->qgroups_kobj) { kobject_del(fs_info->qgroups_kobj); kobject_put(fs_info->qgroups_kobj); fs_info->qgroups_kobj = NULL; } } /* Called when qgroups get initialized, thus there is no need for locking */ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; int ret = 0; if (btrfs_is_testing(fs_info)) return 0; ASSERT(fsid_kobj); if (fs_info->qgroups_kobj) return 0; fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); if (!fs_info->qgroups_kobj) return -ENOMEM; ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, fsid_kobj, "qgroups"); if (ret < 0) goto out; rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) { ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out; } out: if (ret < 0) btrfs_sysfs_del_qgroups(fs_info); return ret; } void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { if (btrfs_is_testing(fs_info)) return; if (qgroup->kobj.state_initialized) { kobject_del(&qgroup->kobj); kobject_put(&qgroup->kobj); } } /* * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. Call after any changes to incompat/compat_ro flags */ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj; int ret; if (!fs_info) return; fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); if (ret < 0) btrfs_warn(fs_info, "failed to update /sys/fs/btrfs/%pU/features: %d", fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) { int ret; btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) goto out2; ret = sysfs_merge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); if (ret) goto out_remove_group; #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); if (ret) { sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); goto out_remove_group; } #endif return 0; out_remove_group: sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: kset_unregister(btrfs_kset); return ret; } void __cold btrfs_exit_sysfs(void) { sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); #ifdef CONFIG_BTRFS_DEBUG sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); #endif kset_unregister(btrfs_kset); } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 | // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles path walking and related routines * * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/fs.h> #include <linux/namei.h> #include "internal.h" /* * Mark the backing file as being a cache file if it's not already in use. The * mark tells the culling request command that it's not allowed to cull the * file or directory. The caller must hold the inode lock. */ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, struct inode *inode) { bool can_use = false; if (!(inode->i_flags & S_KERNEL_FILE)) { inode->i_flags |= S_KERNEL_FILE; trace_cachefiles_mark_active(object, inode); can_use = true; } else { trace_cachefiles_mark_failed(object, inode); } return can_use; } static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, struct inode *inode) { bool can_use; inode_lock(inode); can_use = __cachefiles_mark_inode_in_use(object, inode); inode_unlock(inode); return can_use; } /* * Unmark a backing inode. The caller must hold the inode lock. */ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct inode *inode) { inode->i_flags &= ~S_KERNEL_FILE; trace_cachefiles_mark_inactive(object, inode); } static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, struct inode *inode) { inode_lock(inode); __cachefiles_unmark_inode_in_use(object, inode); inode_unlock(inode); } /* * Unmark a backing inode and tell cachefilesd that there's something that can * be culled. */ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct file *file) { struct cachefiles_cache *cache = object->volume->cache; struct inode *inode = file_inode(file); cachefiles_do_unmark_inode_in_use(object, inode); if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { atomic_long_add(inode->i_blocks, &cache->b_released); if (atomic_inc_return(&cache->f_released)) cachefiles_state_changed(cache); } } /* * get a subdirectory */ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, struct dentry *dir, const char *dirname, bool *_is_new) { struct dentry *subdir; struct path path; int ret; _enter(",,%s", dirname); /* search the current directory for the element name */ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); retry: ret = cachefiles_inject_read_error(); if (ret == 0) subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir); else subdir = ERR_PTR(ret); trace_cachefiles_lookup(NULL, dir, subdir); if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_backing_inode(dir), PTR_ERR(subdir), cachefiles_trace_lookup_error); if (PTR_ERR(subdir) == -ENOMEM) goto nomem_d_alloc; goto lookup_error; } _debug("subdir -> %pd %s", subdir, d_backing_inode(subdir) ? "positive" : "negative"); /* we need to create the subdir if it doesn't exist yet */ if (d_is_negative(subdir)) { ret = cachefiles_has_space(cache, 1, 0, cachefiles_has_space_for_create); if (ret < 0) goto mkdir_error; _debug("attempt mkdir"); path.mnt = cache->mnt; path.dentry = dir; ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; ret = cachefiles_inject_write_error(); if (ret == 0) subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); else subdir = ERR_PTR(ret); if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); goto mkdir_error; } trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { dput(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %pd{ino=%lu}", subdir, d_backing_inode(subdir)->i_ino); if (_is_new) *_is_new = true; } /* Tell rmdir() it's not allowed to delete the subdir */ inode_lock(d_inode(subdir)); inode_unlock(d_inode(dir)); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", subdir, d_inode(subdir)->i_ino); goto mark_error; } inode_unlock(d_inode(subdir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); if (!d_can_lookup(subdir)) { pr_err("%s is not a directory\n", dirname); ret = -EIO; goto check_error; } ret = -EPERM; if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || !d_backing_inode(subdir)->i_op->lookup || !d_backing_inode(subdir)->i_op->mkdir || !d_backing_inode(subdir)->i_op->rename || !d_backing_inode(subdir)->i_op->rmdir || !d_backing_inode(subdir)->i_op->unlink) goto check_error; _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); return subdir; check_error: cachefiles_put_directory(subdir); _leave(" = %d [check]", ret); return ERR_PTR(ret); mark_error: inode_unlock(d_inode(subdir)); dput(subdir); return ERR_PTR(-EBUSY); mkdir_error: inode_unlock(d_inode(dir)); if (!IS_ERR(subdir)) dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } /* * Put a subdirectory. */ void cachefiles_put_directory(struct dentry *dir) { if (dir) { cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir)); dput(dir); } } /* * Remove a regular file from the cache. */ static int cachefiles_unlink(struct cachefiles_cache *cache, struct cachefiles_object *object, struct dentry *dir, struct dentry *dentry, enum fscache_why_object_killed why) { struct path path = { .mnt = cache->mnt, .dentry = dir, }; int ret; trace_cachefiles_unlink(object, d_inode(dentry)->i_ino, why); ret = security_path_unlink(&path, dentry); if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); return ret; } ret = cachefiles_inject_remove_error(); if (ret == 0) { ret = vfs_unlink(&nop_mnt_idmap, d_backing_inode(dir), dentry, NULL); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); } if (ret != 0) trace_cachefiles_vfs_error(object, d_backing_inode(dir), ret, cachefiles_trace_unlink_error); return ret; } /* * Delete an object representation from the cache * - File backed objects are unlinked * - Directory backed objects are stuffed into the graveyard for userspace to * delete */ int cachefiles_bury_object(struct cachefiles_cache *cache, struct cachefiles_object *object, struct dentry *dir, struct dentry *rep, enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; char nbuffer[8 + 8 + 1]; int ret; _enter(",'%pd','%pd'", dir, rep); if (rep->d_parent != dir) { inode_unlock(d_inode(dir)); _leave(" = -ESTALE"); return -ESTALE; } /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { dget(rep); /* Stop the dentry being negated if it's only pinned * by a file struct. */ ret = cachefiles_unlink(cache, object, dir, rep, why); dput(rep); inode_unlock(d_inode(dir)); _leave(" = %d", ret); return ret; } /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ sprintf(nbuffer, "%08x%08x", (uint32_t) ktime_get_real_seconds(), (uint32_t) atomic_inc_return(&cache->gravecounter)); /* do the multiway lock magic */ trap = lock_rename(cache->graveyard, dir); if (IS_ERR(trap)) return PTR_ERR(trap); /* do some checks before getting the grave dentry */ if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { /* the entry was probably culled when we dropped the parent dir * lock */ unlock_rename(cache->graveyard, dir); _leave(" = 0 [culled?]"); return 0; } if (!d_can_lookup(cache->graveyard)) { unlock_rename(cache->graveyard, dir); cachefiles_io_error(cache, "Graveyard no longer a directory"); return -EIO; } if (trap == rep) { unlock_rename(cache->graveyard, dir); cachefiles_io_error(cache, "May not make directory loop"); return -EIO; } if (d_mountpoint(rep)) { unlock_rename(cache->graveyard, dir); cachefiles_io_error(cache, "Mountpoint in cache"); return -EIO; } grave = lookup_one(&nop_mnt_idmap, &QSTR(nbuffer), cache->graveyard); if (IS_ERR(grave)) { unlock_rename(cache->graveyard, dir); trace_cachefiles_vfs_error(object, d_inode(cache->graveyard), PTR_ERR(grave), cachefiles_trace_lookup_error); if (PTR_ERR(grave) == -ENOMEM) { _leave(" = -ENOMEM"); return -ENOMEM; } cachefiles_io_error(cache, "Lookup error %ld", PTR_ERR(grave)); return -EIO; } if (d_is_positive(grave)) { unlock_rename(cache->graveyard, dir); dput(grave); grave = NULL; cond_resched(); goto try_again; } if (d_mountpoint(grave)) { unlock_rename(cache->graveyard, dir); dput(grave); cachefiles_io_error(cache, "Mountpoint in graveyard"); return -EIO; } /* target should not be an ancestor of source */ if (trap == grave) { unlock_rename(cache->graveyard, dir); dput(grave); cachefiles_io_error(cache, "May not make directory loop"); return -EIO; } /* attempt the rename */ path.mnt = cache->mnt; path.dentry = dir; path_to_graveyard.mnt = cache->mnt; path_to_graveyard.dentry = cache->graveyard; ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, .old_dir = d_inode(dir), .old_dentry = rep, .new_mnt_idmap = &nop_mnt_idmap, .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); ret = cachefiles_inject_read_error(); if (ret == 0) ret = vfs_rename(&rd); if (ret != 0) trace_cachefiles_vfs_error(object, d_inode(dir), ret, cachefiles_trace_rename_error); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); } __cachefiles_unmark_inode_in_use(object, d_inode(rep)); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); return 0; } /* * Delete a cache file. */ int cachefiles_delete_object(struct cachefiles_object *object, enum fscache_why_object_killed why) { struct cachefiles_volume *volume = object->volume; struct dentry *dentry = object->file->f_path.dentry; struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; _enter(",OBJ%x{%pD}", object->debug_id, object->file); /* Stop the dentry being negated if it's only pinned by a file struct. */ dget(dentry); inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT); ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); inode_unlock(d_backing_inode(fan)); dput(dentry); return ret; } /* * Create a temporary file and leave it unattached and un-xattr'd until the * time comes to discard the object from memory. */ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) { struct cachefiles_volume *volume = object->volume; struct cachefiles_cache *cache = volume->cache; const struct cred *saved_cred; struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; const struct path parentpath = { .mnt = cache->mnt, .dentry = fan }; uint64_t ni_size; long ret; cachefiles_begin_secure(cache, &saved_cred); ret = cachefiles_inject_write_error(); if (ret == 0) { file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG | 0600, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); } if (ret) { trace_cachefiles_vfs_error(object, d_inode(fan), ret, cachefiles_trace_tmpfile_error); if (ret == -EIO) cachefiles_io_error_obj(object, "Failed to create tmpfile"); goto err; } trace_cachefiles_tmpfile(object, file_inode(file)); /* This is a newly created file with no other possible user */ if (!cachefiles_mark_inode_in_use(object, file_inode(file))) WARN_ON(1); ret = cachefiles_ondemand_init_object(object); if (ret < 0) goto err_unuse; ni_size = object->cookie->object_size; ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); if (ni_size > 0) { trace_cachefiles_trunc(object, file_inode(file), 0, ni_size, cachefiles_trunc_expand_tmpfile); ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_truncate(&file->f_path, ni_size); if (ret < 0) { trace_cachefiles_vfs_error( object, file_inode(file), ret, cachefiles_trace_trunc_error); goto err_unuse; } } ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); goto err_unuse; } out: cachefiles_end_secure(cache, saved_cred); return file; err_unuse: cachefiles_do_unmark_inode_in_use(object, file_inode(file)); fput(file); err: file = ERR_PTR(ret); goto out; } /* * Create a new file. */ static bool cachefiles_create_file(struct cachefiles_object *object) { struct file *file; int ret; ret = cachefiles_has_space(object->volume->cache, 1, 0, cachefiles_has_space_for_create); if (ret < 0) return false; file = cachefiles_create_tmpfile(object); if (IS_ERR(file)) return false; set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino); object->file = file; return true; } /* * Open an existing file, checking its attributes and replacing it if it is * stale. */ static bool cachefiles_open_file(struct cachefiles_object *object, struct dentry *dentry) { struct cachefiles_cache *cache = object->volume->cache; struct file *file; struct path path; int ret; _enter("%pd", dentry); if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", dentry, d_inode(dentry)->i_ino); return false; } /* We need to open a file interface onto a data file now as we can't do * it on demand because writeback called from do_exit() sees * current->fs == NULL - which breaks d_path() called from ext4 open. */ path.mnt = cache->mnt; path.dentry = dentry; file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), cachefiles_trace_open_error); goto error; } if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { pr_notice("Cache does not support read_iter and write_iter\n"); goto error_fput; } _debug("file -> %pd positive", dentry); ret = cachefiles_ondemand_init_object(object); if (ret < 0) goto error_fput; ret = cachefiles_check_auxdata(object, file); if (ret < 0) goto check_failed; clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); object->file = file; /* Always update the atime on an object we've just looked up (this is * used to keep track of culling, and atimes are only updated by read, * write and readdir but not lookup or open). */ touch_atime(&file->f_path); return true; check_failed: fscache_cookie_lookup_negative(object->cookie); cachefiles_unmark_inode_in_use(object, file); fput(file); if (ret == -ESTALE) return cachefiles_create_file(object); return false; error_fput: fput(file); error: cachefiles_do_unmark_inode_in_use(object, d_inode(dentry)); return false; } /* * walk from the parent object to the child object through the backing * filesystem, creating directories as we go */ bool cachefiles_look_up_object(struct cachefiles_object *object) { struct cachefiles_volume *volume = object->volume; struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; _enter("OBJ%x,%s,", object->debug_id, object->d_name); /* Look up path "cache/vol/fanout/file". */ ret = cachefiles_inject_read_error(); if (ret == 0) dentry = lookup_one_positive_unlocked(&nop_mnt_idmap, &QSTR(object->d_name), fan); else dentry = ERR_PTR(ret); trace_cachefiles_lookup(object, fan, dentry); if (IS_ERR(dentry)) { if (dentry == ERR_PTR(-ENOENT)) goto new_file; if (dentry == ERR_PTR(-EIO)) cachefiles_io_error_obj(object, "Lookup failed"); return false; } if (!d_is_reg(dentry)) { pr_err("%pd is not a file\n", dentry); inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); ret = cachefiles_bury_object(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_WEIRD); dput(dentry); if (ret < 0) return false; goto new_file; } ret = cachefiles_open_file(object, dentry); dput(dentry); if (!ret) return false; _leave(" = t [%lu]", file_inode(object->file)->i_ino); return true; new_file: fscache_cookie_lookup_negative(object->cookie); return cachefiles_create_file(object); } /* * Attempt to link a temporary file into its rightful place in the cache. */ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, struct cachefiles_object *object) { struct cachefiles_volume *volume = object->volume; struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; bool success = false; int ret; _enter(",%pD", object->file); inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); ret = cachefiles_inject_read_error(); if (ret == 0) dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); goto out_unlock; } if (!d_is_negative(dentry)) { ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) goto out_dput; dput(dentry); ret = cachefiles_inject_read_error(); if (ret == 0) dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); goto out_unlock; } } ret = cachefiles_inject_read_error(); if (ret == 0) ret = vfs_link(object->file->f_path.dentry, &nop_mnt_idmap, d_inode(fan), dentry, NULL); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(fan), ret, cachefiles_trace_link_error); _debug("link fail %d", ret); } else { trace_cachefiles_link(object, file_inode(object->file)); spin_lock(&object->lock); /* TODO: Do we want to switch the file pointer to the new dentry? */ clear_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); spin_unlock(&object->lock); success = true; } out_dput: dput(dentry); out_unlock: inode_unlock(d_inode(fan)); _leave(" = %u", success); return success; } /* * Look up an inode to be checked or culled. Return -EBUSY if the inode is * marked in use. */ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; int ret = -ENOENT; inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir); if (IS_ERR(victim)) goto lookup_error; if (d_is_negative(victim)) goto lookup_put; if (d_inode(victim)->i_flags & S_KERNEL_FILE) goto lookup_busy; return victim; lookup_busy: ret = -EBUSY; lookup_put: inode_unlock(d_inode(dir)); dput(victim); return ERR_PTR(ret); lookup_error: inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); } else if (ret != -ENOMEM) { pr_err("Internal error: %d\n", ret); ret = -EIO; } return ERR_PTR(ret); } /* * Cull an object if it's not in use * - called only by cache manager daemon */ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; struct inode *inode; int ret; _enter(",%pd/,%s", dir, filename); victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); /* check to see if someone is using this object */ inode = d_inode(victim); inode_lock(inode); if (inode->i_flags & S_KERNEL_FILE) { ret = -EBUSY; } else { /* Stop the cache from picking it back up */ inode->i_flags |= S_KERNEL_FILE; ret = 0; } inode_unlock(inode); if (ret < 0) goto error_unlock; ret = cachefiles_bury_object(cache, NULL, dir, victim, FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; fscache_count_culled(); dput(victim); _leave(" = 0"); return 0; error_unlock: inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) return -ESTALE; /* Probably got retired by the netfs */ if (ret != -ENOMEM) { pr_err("Internal error: %d\n", ret); ret = -EIO; } _leave(" = %d", ret); return ret; } /* * Find out if an object is in use or not * - called only by cache manager daemon * - returns -EBUSY or 0 to indicate whether an object is in use or not */ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; int ret = 0; victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); inode_unlock(d_inode(dir)); dput(victim); return ret; } |
| 34 33 3 1 2 16 13 3 3 3 6 1 13 20 98 98 44 28 37 2 10 11 1 23 23 37 13 13 13 20 8 13 13 21 1 1 20 1 21 8 20 5 1 4 4 1 3 3 39 38 24 24 18 3 3 2 3 3 3 3 3 3 3 3 9 1 1 7 1 1 5 5 6 3 3 3 1 8 8 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 | // SPDX-License-Identifier: GPL-2.0 /* * Filesystem-level keyring for fscrypt * * Copyright 2019 Google LLC */ /* * This file implements management of fscrypt master keys in the * filesystem-level keyring, including the ioctls: * * - FS_IOC_ADD_ENCRYPTION_KEY * - FS_IOC_REMOVE_ENCRYPTION_KEY * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS * - FS_IOC_GET_ENCRYPTION_KEY_STATUS * * See the "User API" section of Documentation/filesystems/fscrypt.rst for more * information about these ioctls. */ #include <linux/unaligned.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/random.h> #include <linux/once.h> #include <linux/seq_file.h> #include "fscrypt_private.h" /* The master encryption keys for a filesystem (->s_master_keys) */ struct fscrypt_keyring { /* * Lock that protects ->key_hashtable. It does *not* protect the * fscrypt_master_key structs themselves. */ spinlock_t lock; /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */ struct hlist_head key_hashtable[128]; }; static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { fscrypt_destroy_hkdf(&secret->hkdf); memzero_explicit(secret, sizeof(*secret)); } static void move_master_key_secret(struct fscrypt_master_key_secret *dst, struct fscrypt_master_key_secret *src) { memcpy(dst, src, sizeof(*dst)); memzero_explicit(src, sizeof(*src)); } static void fscrypt_free_master_key(struct rcu_head *head) { struct fscrypt_master_key *mk = container_of(head, struct fscrypt_master_key, mk_rcu_head); /* * The master key secret and any embedded subkeys should have already * been wiped when the last active reference to the fscrypt_master_key * struct was dropped; doing it here would be unnecessarily late. * Nevertheless, use kfree_sensitive() in case anything was missed. */ kfree_sensitive(mk); } void fscrypt_put_master_key(struct fscrypt_master_key *mk) { if (!refcount_dec_and_test(&mk->mk_struct_refs)) return; /* * No structural references left, so free ->mk_users, and also free the * fscrypt_master_key struct itself after an RCU grace period ensures * that concurrent keyring lookups can no longer find it. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0); if (mk->mk_users) { /* Clear the keyring so the quota gets released right away. */ keyring_clear(mk->mk_users); key_put(mk->mk_users); mk->mk_users = NULL; } call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk) { size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) return; /* * No active references left, so complete the full removal of this * fscrypt_master_key struct by removing it from the keyring and * destroying any subkeys embedded in it. */ if (WARN_ON_ONCE(!sb->s_master_keys)) return; spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_present is false and * ->mk_decrypted_inodes is empty. */ WARN_ON_ONCE(mk->mk_present); WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { fscrypt_destroy_prepared_key( sb, &mk->mk_direct_keys[i]); fscrypt_destroy_prepared_key( sb, &mk->mk_iv_ino_lblk_64_keys[i]); fscrypt_destroy_prepared_key( sb, &mk->mk_iv_ino_lblk_32_keys[i]); } memzero_explicit(&mk->mk_ino_hash_key, sizeof(mk->mk_ino_hash_key)); mk->mk_ino_hash_key_initialized = false; /* Drop the structural ref associated with the active refs. */ fscrypt_put_master_key(mk); } /* * This transitions the key state from present to incompletely removed, and then * potentially to absent (depending on whether inodes remain). */ static void fscrypt_initiate_key_removal(struct super_block *sb, struct fscrypt_master_key *mk) { WRITE_ONCE(mk->mk_present, false); wipe_master_key_secret(&mk->mk_secret); fscrypt_put_master_key_activeref(sb, mk); } static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) { if (spec->__reserved) return false; return master_key_spec_len(spec) != 0; } static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { /* * We just charge FSCRYPT_MAX_RAW_KEY_SIZE bytes to the user's key quota * for each key, regardless of the exact key size. The amount of memory * actually used is greater than the size of the raw key anyway. */ return key_payload_reserve(key, FSCRYPT_MAX_RAW_KEY_SIZE); } static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); } /* * Type of key in ->mk_users. Each key of this type represents a particular * user who has added a particular master key. * * Note that the name of this key type really should be something like * ".fscrypt-user" instead of simply ".fscrypt". But the shorter name is chosen * mainly for simplicity of presentation in /proc/keys when read by a non-root * user. And it is expected to be rare that a key is actually added by multiple * users, since users should keep their encryption keys confidential. */ static struct key_type key_type_fscrypt_user = { .name = ".fscrypt", .instantiate = fscrypt_user_key_instantiate, .describe = fscrypt_user_key_describe, }; #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ CONST_STRLEN("-users") + 1) #define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) static void format_mk_users_keyring_description( char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { sprintf(description, "fscrypt-%*phN-users", FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); } static void format_mk_user_description( char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier, __kuid_val(current_fsuid())); } /* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ static int allocate_filesystem_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring; if (sb->s_master_keys) return 0; keyring = kzalloc(sizeof(*keyring), GFP_KERNEL); if (!keyring) return -ENOMEM; spin_lock_init(&keyring->lock); /* * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). * I.e., here we publish ->s_master_keys with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. */ smp_store_release(&sb->s_master_keys, keyring); return 0; } /* * Release all encryption keys that have been added to the filesystem, along * with the keyring that contains them. * * This is called at unmount time, after all potentially-encrypted inodes have * been evicted. The filesystem's underlying block device(s) are still * available at this time; this is important because after user file accesses * have been allowed, this function may need to evict keys from the keyslots of * an inline crypto engine, which requires the block device(s). */ void fscrypt_destroy_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!keyring) return; for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) { struct hlist_head *bucket = &keyring->key_hashtable[i]; struct fscrypt_master_key *mk; struct hlist_node *tmp; hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) { /* * Since all potentially-encrypted inodes were already * evicted, every key remaining in the keyring should * have an empty inode list, and should only still be in * the keyring due to the single active ref associated * with ->mk_present. There should be no structural * refs beyond the one associated with the active ref. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1); WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON_ONCE(!mk->mk_present); fscrypt_initiate_key_removal(sb, mk); } } kfree_sensitive(keyring); sb->s_master_keys = NULL; } static struct hlist_head * fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring, const struct fscrypt_key_specifier *mk_spec) { /* * Since key specifiers should be "random" values, it is sufficient to * use a trivial hash function that just takes the first several bits of * the key specifier. */ unsigned long i = get_unaligned((unsigned long *)&mk_spec->u); return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)]; } /* * Find the specified master key struct in ->s_master_keys and take a structural * ref to it. The structural ref guarantees that the key struct continues to * exist, but it does *not* guarantee that ->s_master_keys continues to contain * the key struct. The structural ref needs to be dropped by * fscrypt_put_master_key(). Returns NULL if the key struct is not found. */ struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring; struct hlist_head *bucket; struct fscrypt_master_key *mk; /* * Pairs with the smp_store_release() in allocate_filesystem_keyring(). * I.e., another task can publish ->s_master_keys concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) return NULL; /* No keyring yet, so no keys yet. */ bucket = fscrypt_mk_hash_bucket(keyring, mk_spec); rcu_read_lock(); switch (mk_spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: hlist_for_each_entry_rcu(mk, bucket, mk_node) { if (mk->mk_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && memcmp(mk->mk_spec.u.descriptor, mk_spec->u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 && refcount_inc_not_zero(&mk->mk_struct_refs)) goto out; } break; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: hlist_for_each_entry_rcu(mk, bucket, mk_node) { if (mk->mk_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && memcmp(mk->mk_spec.u.identifier, mk_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 && refcount_inc_not_zero(&mk->mk_struct_refs)) goto out; } break; } mk = NULL; out: rcu_read_unlock(); return mk; } static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE]; struct key *keyring; format_mk_users_keyring_description(description, mk->mk_spec.u.identifier); keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); mk->mk_users = keyring; return 0; } /* * Find the current user's "key" in the master key's ->mk_users. * Returns ERR_PTR(-ENOKEY) if not found. */ static struct key *find_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; key_ref_t keyref; format_mk_user_description(description, mk->mk_spec.u.identifier); /* * We need to mark the keyring reference as "possessed" so that we * acquire permission to search it, via the KEY_POS_SEARCH permission. */ keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/), &key_type_fscrypt_user, description, false); if (IS_ERR(keyref)) { if (PTR_ERR(keyref) == -EAGAIN || /* not found */ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ keyref = ERR_PTR(-ENOKEY); return ERR_CAST(keyref); } return key_ref_to_ptr(keyref); } /* * Give the current user a "key" in ->mk_users. This charges the user's quota * and marks the master key as added by the current user, so that it cannot be * removed by another user with the key. Either ->mk_sem must be held for * write, or the master key must be still undergoing initialization. */ static int add_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; struct key *mk_user; int err; format_mk_user_description(description, mk->mk_spec.u.identifier); mk_user = key_alloc(&key_type_fscrypt_user, description, current_fsuid(), current_gid(), current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); key_put(mk_user); return err; } /* * Remove the current user's "key" from ->mk_users. * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ static int remove_master_key_user(struct fscrypt_master_key *mk) { struct key *mk_user; int err; mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_unlink(mk->mk_users, mk_user); key_put(mk_user); return err; } /* * Allocate a new fscrypt_master_key, transfer the given secret over to it, and * insert it into sb->s_master_keys. */ static int add_new_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) goto out_put; err = add_master_key_user(mk); if (err) goto out_put; } move_master_key_secret(&mk->mk_secret, secret); mk->mk_present = true; refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */ spin_lock(&keyring->lock); hlist_add_head_rcu(&mk->mk_node, fscrypt_mk_hash_bucket(keyring, mk_spec)); spin_unlock(&keyring->lock); return 0; out_put: fscrypt_put_master_key(mk); return err; } #define KEY_DEAD 1 static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { int err; /* * If the current user is already in ->mk_users, then there's nothing to * do. Otherwise, we need to add the user to ->mk_users. (Neither is * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { struct key *mk_user = find_master_key_user(mk); if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } err = add_master_key_user(mk); if (err) return err; } /* If the key is incompletely removed, make it present again. */ if (!mk->mk_present) { if (!refcount_inc_not_zero(&mk->mk_active_refs)) { /* * Raced with the last active ref being dropped, so the * key has become, or is about to become, "absent". * Therefore, we need to allocate a new key struct. */ return KEY_DEAD; } move_master_key_secret(&mk->mk_secret, secret); WRITE_ONCE(mk->mk_present, true); } return 0; } static int do_add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ mk = fscrypt_find_master_key(sb, mk_spec); if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. */ err = allocate_filesystem_keyring(sb); if (!err) err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Add the user to ->mk_users * if needed, and make the key "present" again if possible. */ down_write(&mk->mk_sem); err = add_existing_master_key(mk, secret); up_write(&mk->mk_sem); if (err == KEY_DEAD) { /* * We found a key struct, but it's already been fully * removed. Ignore the old struct and add a new one. * fscrypt_add_key_mutex means we don't need to worry * about concurrent adds. */ err = add_new_master_key(sb, secret, mk_spec); } fscrypt_put_master_key(mk); } mutex_unlock(&fscrypt_add_key_mutex); return err; } static int add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, struct fscrypt_key_specifier *key_spec) { int err; if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]; u8 *kdf_key = secret->bytes; unsigned int kdf_key_size = secret->size; u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY; /* * For raw keys, the fscrypt master key is used directly as the * fscrypt KDF key. For hardware-wrapped keys, we have to pass * the master key to the hardware to derive the KDF key, which * is then only used to derive non-file-contents subkeys. */ if (secret->is_hw_wrapped) { err = fscrypt_derive_sw_secret(sb, secret->bytes, secret->size, sw_secret); if (err) return err; kdf_key = sw_secret; kdf_key_size = sizeof(sw_secret); /* * To avoid weird behavior if someone manages to * determine sw_secret and add it as a raw key, ensure * that hardware-wrapped keys and raw keys will have * different key identifiers by deriving their key * identifiers using different KDF contexts. */ keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; } err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); /* * Now that the KDF context is initialized, the raw KDF key is * no longer needed. */ memzero_explicit(kdf_key, kdf_key_size); if (err) return err; /* Calculate the key identifier */ err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, key_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); if (err) return err; } return do_add_master_key(sb, secret, key_spec); } /* * Validate the size of an fscrypt master key being added. Note that this is * just an initial check, as we don't know which ciphers will be used yet. * There is a stricter size check later when the key is actually used by a file. */ static inline bool fscrypt_valid_key_size(size_t size, u32 add_key_flags) { u32 max_size = (add_key_flags & FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) ? FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE : FSCRYPT_MAX_RAW_KEY_SIZE; return size >= FSCRYPT_MIN_KEY_SIZE && size <= max_size; } static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; if (prep->datalen < sizeof(*payload)) return -EINVAL; if (!fscrypt_valid_key_size(prep->datalen - sizeof(*payload), payload->flags)) return -EINVAL; if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; if (payload->flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); if (!prep->payload.data[0]) return -ENOMEM; prep->quotalen = prep->datalen; return 0; } static void fscrypt_provisioning_key_free_preparse( struct key_preparsed_payload *prep) { kfree_sensitive(prep->payload.data[0]); } static void fscrypt_provisioning_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); if (key_is_positive(key)) { const struct fscrypt_provisioning_key_payload *payload = key->payload.data[0]; seq_printf(m, ": %u [%u]", key->datalen, payload->type); } } static void fscrypt_provisioning_key_destroy(struct key *key) { kfree_sensitive(key->payload.data[0]); } static struct key_type key_type_fscrypt_provisioning = { .name = "fscrypt-provisioning", .preparse = fscrypt_provisioning_key_preparse, .free_preparse = fscrypt_provisioning_key_free_preparse, .instantiate = generic_key_instantiate, .describe = fscrypt_provisioning_key_describe, .destroy = fscrypt_provisioning_key_destroy, }; /* * Retrieve the key from the Linux keyring key specified by 'key_id', and store * it into 'secret'. * * The key must be of type "fscrypt-provisioning" and must have the 'type' and * 'flags' field of the payload set to the given values, indicating that the key * is intended for use for the specified purpose. We don't use the "logon" key * type because there's no way to completely restrict the use of such keys; they * can be used by any kernel API that accepts "logon" keys and doesn't require a * specific service prefix. * * The ability to specify the key via Linux keyring key is intended for cases * where userspace needs to re-add keys after the filesystem is unmounted and * re-mounted. Most users should just provide the key directly instead. */ static int get_keyring_key(u32 key_id, u32 type, u32 flags, struct fscrypt_master_key_secret *secret) { key_ref_t ref; struct key *key; const struct fscrypt_provisioning_key_payload *payload; int err; ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH); if (IS_ERR(ref)) return PTR_ERR(ref); key = key_ref_to_ptr(ref); if (key->type != &key_type_fscrypt_provisioning) goto bad_key; payload = key->payload.data[0]; /* * Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. * Similarly, don't allow hardware-wrapped keys to be used as * non-hardware-wrapped keys and vice versa. */ if (payload->type != type || payload->flags != flags) goto bad_key; secret->size = key->datalen - sizeof(*payload); memcpy(secret->bytes, payload->raw, secret->size); err = 0; goto out_put; bad_key: err = -EKEYREJECTED; out_put: key_ref_put(ref); return err; } /* * Add a master encryption key to the filesystem, causing all files which were * encrypted with it to appear "unlocked" (decrypted) when accessed. * * When adding a key for use by v1 encryption policies, this ioctl is * privileged, and userspace must provide the 'key_descriptor'. * * When adding a key for use by v2+ encryption policies, this ioctl is * unprivileged. This is needed, in general, to allow non-root users to use * encryption without encountering the visibility problems of process-subscribed * keyrings and the inability to properly remove keys. This works by having * each key identified by its cryptographically secure hash --- the * 'key_identifier'. The cryptographic hash ensures that a malicious user * cannot add the wrong key for a given identifier. Furthermore, each added key * is charged to the appropriate user's quota for the keyrings service, which * prevents a malicious user from adding too many keys. Finally, we forbid a * user from removing a key while other users have added it too, which prevents * a user who knows another user's key from causing a denial-of-service by * removing it at an inopportune time. (We tolerate that a user who knows a key * can prevent other users from removing it.) * * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_add_key_arg __user *uarg = _uarg; struct fscrypt_add_key_arg arg; struct fscrypt_master_key_secret secret; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add keys that are identified by an arbitrary descriptor * rather than by a cryptographic hash --- since otherwise a malicious * user could add the wrong key. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; memset(&secret, 0, sizeof(secret)); if (arg.flags) { if (arg.flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; secret.is_hw_wrapped = true; } if (arg.key_id) { if (arg.raw_size != 0) return -EINVAL; err = get_keyring_key(arg.key_id, arg.key_spec.type, arg.flags, &secret); if (err) goto out_wipe_secret; } else { if (!fscrypt_valid_key_size(arg.raw_size, arg.flags)) return -EINVAL; secret.size = arg.raw_size; err = -EFAULT; if (copy_from_user(secret.bytes, uarg->raw, secret.size)) goto out_wipe_secret; } err = add_master_key(sb, &secret, &arg.key_spec); if (err) goto out_wipe_secret; /* Return the key identifier to userspace, if applicable */ err = -EFAULT; if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE)) goto out_wipe_secret; err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); static void fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { static u8 test_key[FSCRYPT_MAX_RAW_KEY_SIZE]; get_random_once(test_key, sizeof(test_key)); memset(secret, 0, sizeof(*secret)); secret->size = sizeof(test_key); memcpy(secret->bytes, test_key, sizeof(test_key)); } int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_master_key_secret secret; int err; fscrypt_get_test_dummy_secret(&secret); err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); if (err) goto out; err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); out: wipe_master_key_secret(&secret); return err; } /** * fscrypt_add_test_dummy_key() - add the test dummy encryption key * @sb: the filesystem instance to add the key to * @key_spec: the key specifier of the test dummy encryption key * * Add the key for the test_dummy_encryption mount option to the filesystem. To * prevent misuse of this mount option, a per-boot random key is used instead of * a hardcoded one. This makes it so that any encrypted files created using * this option won't be accessible after a reboot. * * Return: 0 on success, -errno on failure */ int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec) { struct fscrypt_master_key_secret secret; int err; fscrypt_get_test_dummy_secret(&secret); err = add_master_key(sb, &secret, key_spec); wipe_master_key_secret(&secret); return err; } /* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting * their files using some other user's key which they don't actually know. * Cryptographically this isn't much of a problem, but the semantics of this * would be a bit weird, so it's best to just forbid it. * * The system administrator (CAP_FOWNER) can override this, which should be * enough for any use cases where encryption policies are being set using keys * that were chosen ahead of time but aren't available at the moment. * * Note that the key may have already removed by the time this returns, but * that's okay; we just care whether the key was there at some point. * * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code */ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); mk = fscrypt_find_master_key(sb, &mk_spec); if (!mk) { err = -ENOKEY; goto out; } down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); } else { key_put(mk_user); err = 0; } up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; return err; } /* * Try to evict the inode's dentries from the dentry cache. If the inode is a * directory, then it can have at most one dentry; however, that dentry may be * pinned by child dentries, so first try to evict the children too. */ static void shrink_dcache_inode(struct inode *inode) { struct dentry *dentry; if (S_ISDIR(inode->i_mode)) { dentry = d_find_any_alias(inode); if (dentry) { shrink_dcache_parent(dentry); dput(dentry); } } d_prune_aliases(inode); } static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) { struct fscrypt_inode_info *ci; struct inode *inode; struct inode *toput_inode = NULL; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&mk->mk_decrypted_inodes_lock); shrink_dcache_inode(inode); iput(toput_inode); toput_inode = inode; spin_lock(&mk->mk_decrypted_inodes_lock); } spin_unlock(&mk->mk_decrypted_inodes_lock); iput(toput_inode); } static int check_for_busy_inodes(struct super_block *sb, struct fscrypt_master_key *mk) { struct list_head *pos; size_t busy_count = 0; unsigned long ino; char ino_str[50] = ""; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each(pos, &mk->mk_decrypted_inodes) busy_count++; if (busy_count == 0) { spin_unlock(&mk->mk_decrypted_inodes_lock); return 0; } { /* select an example file to show for debugging purposes */ struct inode *inode = list_first_entry(&mk->mk_decrypted_inodes, struct fscrypt_inode_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; } spin_unlock(&mk->mk_decrypted_inodes_lock); /* If the inode is currently being created, ino may still be 0. */ if (ino) snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); fscrypt_warn(NULL, "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, ino_str); return -EBUSY; } static int try_to_lock_encrypted_files(struct super_block *sb, struct fscrypt_master_key *mk) { int err1; int err2; /* * An inode can't be evicted while it is dirty or has dirty pages. * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. * * Just do it the easy way: call sync_filesystem(). It's overkill, but * it works, and it's more important to minimize the amount of caches we * drop than the amount of data we sync. Also, unprivileged users can * already call sync_filesystem() via sys_syncfs() or sys_sync(). */ down_read(&sb->s_umount); err1 = sync_filesystem(sb); up_read(&sb->s_umount); /* If a sync error occurs, still try to evict as much as possible. */ /* * Inodes are pinned by their dentries, so we have to evict their * dentries. shrink_dcache_sb() would suffice, but would be overkill * and inappropriate for use by unprivileged users. So instead go * through the inodes' alias lists and try to evict each dentry. */ evict_dentries_for_decrypted_inodes(mk); /* * evict_dentries_for_decrypted_inodes() already iput() each inode in * the list; any inodes for which that dropped the last reference will * have been evicted due to fscrypt_drop_inode() detecting the key * removal and telling the VFS to evict the inode. So to finish, we * just need to check whether any inodes couldn't be evicted. */ err2 = check_for_busy_inodes(sb, mk); return err1 ?: err2; } /* * Try to remove an fscrypt master encryption key. * * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's * claim to the key, then removes the key itself if no other users have claims. * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the * key itself. * * To "remove the key itself", first we transition the key to the "incompletely * removed" state, so that no more inodes can be unlocked with it. Then we try * to evict all cached inodes that had been unlocked with the key. * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" * state where it tracks the list of remaining inodes. Userspace can execute * the ioctl again later to retry eviction, or alternatively can re-add the key. * * For more details, see the "Removing keys" section of * Documentation/filesystems/fscrypt.rst. */ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add and remove keys that are identified by an arbitrary * descriptor rather than by a cryptographic hash. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; /* Find the key being removed. */ mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) return -ENOKEY; down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { if (all_users) err = keyring_clear(mk->mk_users); else err = remove_master_key_user(mk); if (err) { up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { /* * Other users have still added the key too. We removed * the current user's claim to the key, but we still * can't remove the key itself. */ status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Initiate removal of the key. */ err = -ENOKEY; if (mk->mk_present) { fscrypt_initiate_key_removal(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; up_write(&mk->mk_sem); if (inodes_remain) { /* Some inodes still reference this key; try to evict them. */ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; err = 0; } } /* * We return 0 if we successfully did something: removed a claim to the * key, initiated removal of the key, or tried locking the files again. * Users need to check the informational status flags if they care * whether the key has been fully removed including all files locked. */ out_put_key: fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; } int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg) { return do_remove_key(filp, uarg, false); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; return do_remove_key(filp, uarg, true); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); /* * Retrieve the status of an fscrypt master encryption key. * * We set ->status to indicate whether the key is absent, present, or * incompletely removed. (For an explanation of what these statuses mean and * how they are represented internally, see struct fscrypt_master_key.) This * field allows applications to easily determine the status of an encrypted * directory without using a hack such as trying to open a regular file in it * (which can confuse the "incompletely removed" status with absent or present). * * In addition, for v2 policy keys we allow applications to determine, via * ->status_flags and ->user_count, whether the key has been added by the * current user, by other users, or by both. Most applications should not need * this, since ordinarily only one user should know a given key. However, if a * secret key is shared by multiple users, applications may wish to add an * already-present key to prevent other users from removing it. This ioctl can * be used to check whether that really is the case before the work is done to * add the key --- which might e.g. require prompting the user for a passphrase. * * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; struct fscrypt_master_key *mk; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; arg.status_flags = 0; arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } down_read(&mk->mk_sem); if (!mk->mk_present) { arg.status = refcount_read(&mk->mk_active_refs) > 0 ? FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; err = 0; goto out_release_key; } arg.status = FSCRYPT_KEY_STATUS_PRESENT; if (mk->mk_users) { struct key *mk_user; arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; mk_user = find_master_key_user(mk); if (!IS_ERR(mk_user)) { arg.status_flags |= FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; key_put(mk_user); } else if (mk_user != ERR_PTR(-ENOKEY)) { err = PTR_ERR(mk_user); goto out_release_key; } } err = 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (!err && copy_to_user(uarg, &arg, sizeof(arg))) err = -EFAULT; return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status); int __init fscrypt_init_keyring(void) { int err; err = register_key_type(&key_type_fscrypt_user); if (err) return err; err = register_key_type(&key_type_fscrypt_provisioning); if (err) goto err_unregister_fscrypt_user; return 0; err_unregister_fscrypt_user: unregister_key_type(&key_type_fscrypt_user); return err; } |
| 5 5 5 5 5 5 5 5 5 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vid-out.c - video output support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-osd.h" #include "vivid-vid-common.h" #include "vivid-kthread-out.h" #include "vivid-vid-out.h" static int vid_out_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); const struct vivid_fmt *vfmt = dev->fmt_out; unsigned planes = vfmt->buffers; unsigned h = dev->fmt_out_rect.height; unsigned int size = dev->bytesperline_out[0] * h + vfmt->data_offset[0]; unsigned p; for (p = vfmt->buffers; p < vfmt->planes; p++) size += dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p]; if (dev->field_out == V4L2_FIELD_ALTERNATE) { /* * You cannot use write() with FIELD_ALTERNATE since the field * information (TOP/BOTTOM) cannot be passed to the kernel. */ if (vb2_fileio_is_active(vq)) return -EINVAL; } if (dev->queue_setup_error) { /* * Error injection: test what happens if queue_setup() returns * an error. */ dev->queue_setup_error = false; return -EINVAL; } if (*nplanes) { /* * Check if the number of requested planes match * the number of planes in the current format. You can't mix that. */ if (*nplanes != planes) return -EINVAL; if (sizes[0] < size) return -EINVAL; for (p = 1; p < planes; p++) { if (sizes[p] < dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p]) return -EINVAL; } } else { for (p = 0; p < planes; p++) sizes[p] = p ? dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p] : size; } *nplanes = planes; dprintk(dev, 1, "%s: count=%u\n", __func__, *nbuffers); for (p = 0; p < planes; p++) dprintk(dev, 1, "%s: size[%u]=%u\n", __func__, p, sizes[p]); return 0; } static int vid_out_buf_out_validate(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); dprintk(dev, 1, "%s\n", __func__); if (dev->field_out != V4L2_FIELD_ALTERNATE) vbuf->field = dev->field_out; else if (vbuf->field != V4L2_FIELD_TOP && vbuf->field != V4L2_FIELD_BOTTOM) return -EINVAL; return 0; } static int vid_out_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); const struct vivid_fmt *vfmt = dev->fmt_out; unsigned int planes = vfmt->buffers; unsigned int h = dev->fmt_out_rect.height; unsigned int size = dev->bytesperline_out[0] * h; unsigned p; for (p = vfmt->buffers; p < vfmt->planes; p++) size += dev->bytesperline_out[p] * h / vfmt->vdownsampling[p]; dprintk(dev, 1, "%s\n", __func__); if (WARN_ON(NULL == dev->fmt_out)) return -EINVAL; if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } for (p = 0; p < planes; p++) { if (p) size = dev->bytesperline_out[p] * h / vfmt->vdownsampling[p]; size += vb->planes[p].data_offset; if (vb2_get_plane_payload(vb, p) < size) { dprintk(dev, 1, "%s the payload is too small for plane %u (%lu < %u)\n", __func__, p, vb2_get_plane_payload(vb, p), size); return -EINVAL; } } return 0; } static void vid_out_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vid_out_active); spin_unlock(&dev->slock); } static int vid_out_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err; dev->vid_out_seq_count = 0; dprintk(dev, 1, "%s\n", __func__); if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_out(dev, &dev->vid_out_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vid_out_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vid_out_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_out(dev, &dev->vid_out_streaming); } static void vid_out_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_vid_out); } const struct vb2_ops vivid_vid_out_qops = { .queue_setup = vid_out_queue_setup, .buf_out_validate = vid_out_buf_out_validate, .buf_prepare = vid_out_buf_prepare, .buf_queue = vid_out_buf_queue, .start_streaming = vid_out_start_streaming, .stop_streaming = vid_out_stop_streaming, .buf_request_complete = vid_out_buf_request_complete, }; /* * Called whenever the format has to be reset which can occur when * changing outputs, standard, timings, etc. */ void vivid_update_format_out(struct vivid_dev *dev) { struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; unsigned size, p; u64 pixelclock; switch (dev->output_type[dev->output]) { case SVID: default: dev->field_out = dev->tv_field_out; dev->sink_rect.width = 720; if (dev->std_out & V4L2_STD_525_60) { dev->sink_rect.height = 480; dev->timeperframe_vid_out = (struct v4l2_fract) { 1001, 30000 }; dev->service_set_out = V4L2_SLICED_CAPTION_525; } else { dev->sink_rect.height = 576; dev->timeperframe_vid_out = (struct v4l2_fract) { 1000, 25000 }; dev->service_set_out = V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; } dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; break; case HDMI: dev->sink_rect.width = bt->width; dev->sink_rect.height = bt->height; size = V4L2_DV_BT_FRAME_WIDTH(bt) * V4L2_DV_BT_FRAME_HEIGHT(bt); if (can_reduce_fps(bt) && (bt->flags & V4L2_DV_FL_REDUCED_FPS)) pixelclock = div_u64(bt->pixelclock * 1000, 1001); else pixelclock = bt->pixelclock; dev->timeperframe_vid_out = (struct v4l2_fract) { size / 100, (u32)pixelclock / 100 }; if (bt->interlaced) dev->field_out = V4L2_FIELD_ALTERNATE; else dev->field_out = V4L2_FIELD_NONE; if (!dev->dvi_d_out && (bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { if (bt->width == 720 && bt->height <= 576) dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; else dev->colorspace_out = V4L2_COLORSPACE_REC709; } else { dev->colorspace_out = V4L2_COLORSPACE_SRGB; } break; } dev->xfer_func_out = V4L2_XFER_FUNC_DEFAULT; dev->ycbcr_enc_out = V4L2_YCBCR_ENC_DEFAULT; dev->hsv_enc_out = V4L2_HSV_ENC_180; dev->quantization_out = V4L2_QUANTIZATION_DEFAULT; dev->compose_out = dev->sink_rect; dev->compose_bounds_out = dev->sink_rect; dev->crop_out = dev->compose_out; if (V4L2_FIELD_HAS_T_OR_B(dev->field_out)) dev->crop_out.height /= 2; dev->fmt_out_rect = dev->crop_out; for (p = 0; p < dev->fmt_out->planes; p++) dev->bytesperline_out[p] = (dev->sink_rect.width * dev->fmt_out->bit_depth[p]) / 8; } /* Map the field to something that is valid for the current output */ static enum v4l2_field vivid_field_out(struct vivid_dev *dev, enum v4l2_field field) { if (vivid_is_svid_out(dev)) { switch (field) { case V4L2_FIELD_INTERLACED_TB: case V4L2_FIELD_INTERLACED_BT: case V4L2_FIELD_SEQ_TB: case V4L2_FIELD_SEQ_BT: case V4L2_FIELD_ALTERNATE: return field; case V4L2_FIELD_INTERLACED: default: return V4L2_FIELD_INTERLACED; } } if (vivid_is_hdmi_out(dev)) return dev->dv_timings_out.bt.interlaced ? V4L2_FIELD_ALTERNATE : V4L2_FIELD_NONE; return V4L2_FIELD_NONE; } static enum tpg_pixel_aspect vivid_get_pixel_aspect(const struct vivid_dev *dev) { if (vivid_is_svid_out(dev)) return (dev->std_out & V4L2_STD_525_60) ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; if (vivid_is_hdmi_out(dev) && dev->sink_rect.width == 720 && dev->sink_rect.height <= 576) return dev->sink_rect.height == 480 ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; return TPG_PIXEL_ASPECT_SQUARE; } int vivid_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; const struct vivid_fmt *fmt = dev->fmt_out; unsigned p; mp->width = dev->fmt_out_rect.width; mp->height = dev->fmt_out_rect.height; mp->field = dev->field_out; mp->pixelformat = fmt->fourcc; mp->colorspace = dev->colorspace_out; mp->xfer_func = dev->xfer_func_out; mp->ycbcr_enc = dev->ycbcr_enc_out; mp->quantization = dev->quantization_out; mp->num_planes = fmt->buffers; for (p = 0; p < mp->num_planes; p++) { mp->plane_fmt[p].bytesperline = dev->bytesperline_out[p]; mp->plane_fmt[p].sizeimage = mp->plane_fmt[p].bytesperline * mp->height / fmt->vdownsampling[p] + fmt->data_offset[p]; } for (p = fmt->buffers; p < fmt->planes; p++) { unsigned stride = dev->bytesperline_out[p]; mp->plane_fmt[0].sizeimage += (stride * mp->height) / fmt->vdownsampling[p]; } return 0; } int vivid_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct v4l2_plane_pix_format *pfmt = mp->plane_fmt; const struct vivid_fmt *fmt; unsigned bytesperline, max_bpl; unsigned factor = 1; unsigned w, h; unsigned p; fmt = vivid_get_format(dev, mp->pixelformat); if (!fmt) { dprintk(dev, 1, "Fourcc format (0x%08x) unknown.\n", mp->pixelformat); mp->pixelformat = V4L2_PIX_FMT_YUYV; fmt = vivid_get_format(dev, mp->pixelformat); } mp->field = vivid_field_out(dev, mp->field); if (vivid_is_svid_out(dev)) { w = 720; h = (dev->std_out & V4L2_STD_525_60) ? 480 : 576; } else { w = dev->sink_rect.width; h = dev->sink_rect.height; } if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (!dev->has_scaler_out && !dev->has_crop_out && !dev->has_compose_out) { mp->width = w; mp->height = h / factor; } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height * factor }; v4l2_rect_set_min_size(&r, &vivid_min_rect); v4l2_rect_set_max_size(&r, &vivid_max_rect); if (dev->has_scaler_out && !dev->has_crop_out) { struct v4l2_rect max_r = { 0, 0, MAX_ZOOM * w, MAX_ZOOM * h }; v4l2_rect_set_max_size(&r, &max_r); } else if (!dev->has_scaler_out && dev->has_compose_out && !dev->has_crop_out) { v4l2_rect_set_max_size(&r, &dev->sink_rect); } else if (!dev->has_scaler_out && !dev->has_compose_out) { v4l2_rect_set_min_size(&r, &dev->sink_rect); } mp->width = r.width; mp->height = r.height / factor; } /* This driver supports custom bytesperline values */ mp->num_planes = fmt->buffers; for (p = 0; p < fmt->buffers; p++) { /* Calculate the minimum supported bytesperline value */ bytesperline = (mp->width * fmt->bit_depth[p]) >> 3; /* Calculate the maximum supported bytesperline value */ max_bpl = (MAX_ZOOM * MAX_WIDTH * fmt->bit_depth[p]) >> 3; if (pfmt[p].bytesperline > max_bpl) pfmt[p].bytesperline = max_bpl; if (pfmt[p].bytesperline < bytesperline) pfmt[p].bytesperline = bytesperline; pfmt[p].sizeimage = (pfmt[p].bytesperline * mp->height) / fmt->vdownsampling[p] + fmt->data_offset[p]; memset(pfmt[p].reserved, 0, sizeof(pfmt[p].reserved)); } for (p = fmt->buffers; p < fmt->planes; p++) pfmt[0].sizeimage += (pfmt[0].bytesperline * mp->height * (fmt->bit_depth[p] / fmt->vdownsampling[p])) / (fmt->bit_depth[0] / fmt->vdownsampling[0]); mp->xfer_func = V4L2_XFER_FUNC_DEFAULT; mp->ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT; mp->quantization = V4L2_QUANTIZATION_DEFAULT; if (vivid_is_svid_out(dev)) { mp->colorspace = V4L2_COLORSPACE_SMPTE170M; } else if (dev->dvi_d_out || !(bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { mp->colorspace = V4L2_COLORSPACE_SRGB; if (dev->dvi_d_out) mp->quantization = V4L2_QUANTIZATION_LIM_RANGE; } else if (bt->width == 720 && bt->height <= 576) { mp->colorspace = V4L2_COLORSPACE_SMPTE170M; } else if (mp->colorspace != V4L2_COLORSPACE_SMPTE170M && mp->colorspace != V4L2_COLORSPACE_REC709 && mp->colorspace != V4L2_COLORSPACE_OPRGB && mp->colorspace != V4L2_COLORSPACE_BT2020 && mp->colorspace != V4L2_COLORSPACE_SRGB) { mp->colorspace = V4L2_COLORSPACE_REC709; } memset(mp->reserved, 0, sizeof(mp->reserved)); return 0; } int vivid_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_out; struct v4l2_rect *compose = &dev->compose_out; struct vb2_queue *q = &dev->vb_vid_out_q; int ret = vivid_try_fmt_vid_out(file, priv, f); unsigned factor = 1; unsigned p; if (ret < 0) return ret; if (vb2_is_busy(q) && (vivid_is_svid_out(dev) || mp->width != dev->fmt_out_rect.width || mp->height != dev->fmt_out_rect.height || mp->pixelformat != dev->fmt_out->fourcc || mp->field != dev->field_out)) { dprintk(dev, 1, "%s device busy\n", __func__); return -EBUSY; } /* * Allow for changing the colorspace on the fly. Useful for testing * purposes, and it is something that HDMI transmitters are able * to do. */ if (vb2_is_busy(q)) goto set_colorspace; dev->fmt_out = vivid_get_format(dev, mp->pixelformat); if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (dev->has_scaler_out || dev->has_crop_out || dev->has_compose_out) { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; if (dev->has_scaler_out) { if (dev->has_crop_out) v4l2_rect_map_inside(crop, &r); else *crop = r; if (dev->has_compose_out && !dev->has_crop_out) { struct v4l2_rect min_r = { 0, 0, r.width / MAX_ZOOM, factor * r.height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, r.width * MAX_ZOOM, factor * r.height * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_r); v4l2_rect_set_max_size(compose, &max_r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } else if (dev->has_compose_out) { struct v4l2_rect min_r = { 0, 0, crop->width / MAX_ZOOM, factor * crop->height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, crop->width * MAX_ZOOM, factor * crop->height * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_r); v4l2_rect_set_max_size(compose, &max_r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } } else if (dev->has_compose_out && !dev->has_crop_out) { v4l2_rect_set_size_to(crop, &r); r.height *= factor; v4l2_rect_set_size_to(compose, &r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } else if (!dev->has_compose_out) { v4l2_rect_map_inside(crop, &r); r.height /= factor; v4l2_rect_set_size_to(compose, &r); } else { r.height *= factor; v4l2_rect_set_max_size(compose, &r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); crop->top *= factor; crop->height *= factor; v4l2_rect_set_size_to(crop, compose); v4l2_rect_map_inside(crop, &r); crop->top /= factor; crop->height /= factor; } } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; v4l2_rect_set_size_to(crop, &r); r.height /= factor; v4l2_rect_set_size_to(compose, &r); } dev->fmt_out_rect.width = mp->width; dev->fmt_out_rect.height = mp->height; for (p = 0; p < mp->num_planes; p++) dev->bytesperline_out[p] = mp->plane_fmt[p].bytesperline; for (p = dev->fmt_out->buffers; p < dev->fmt_out->planes; p++) dev->bytesperline_out[p] = (dev->bytesperline_out[0] * dev->fmt_out->bit_depth[p]) / dev->fmt_out->bit_depth[0]; dev->field_out = mp->field; if (vivid_is_svid_out(dev)) dev->tv_field_out = mp->field; set_colorspace: dev->colorspace_out = mp->colorspace; dev->xfer_func_out = mp->xfer_func; dev->ycbcr_enc_out = mp->ycbcr_enc; dev->quantization_out = mp->quantization; struct vivid_dev *in_dev = vivid_output_is_connected_to(dev); if (in_dev) { vivid_send_source_change(in_dev, SVID); vivid_send_source_change(in_dev, HDMI); } return 0; } int vidioc_g_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_g_fmt_vid_out(file, priv, f); } int vidioc_try_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_try_fmt_vid_out(file, priv, f); } int vidioc_s_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_s_fmt_vid_out(file, priv, f); } int vidioc_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_g_fmt_vid_out); } int vidioc_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_try_fmt_vid_out); } int vidioc_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_s_fmt_vid_out); } int vivid_vid_out_g_selection(struct file *file, void *priv, struct v4l2_selection *sel) { struct vivid_dev *dev = video_drvdata(file); if (!dev->has_crop_out && !dev->has_compose_out) return -ENOTTY; if (sel->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; sel->r.left = sel->r.top = 0; switch (sel->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_out) return -EINVAL; sel->r = dev->crop_out; break; case V4L2_SEL_TGT_CROP_DEFAULT: if (!dev->has_crop_out) return -EINVAL; sel->r = dev->fmt_out_rect; break; case V4L2_SEL_TGT_CROP_BOUNDS: if (!dev->has_crop_out) return -EINVAL; sel->r = vivid_max_rect; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_out) return -EINVAL; sel->r = dev->compose_out; break; case V4L2_SEL_TGT_COMPOSE_DEFAULT: case V4L2_SEL_TGT_COMPOSE_BOUNDS: if (!dev->has_compose_out) return -EINVAL; sel->r = dev->sink_rect; break; default: return -EINVAL; } return 0; } int vivid_vid_out_s_selection(struct file *file, void *fh, struct v4l2_selection *s) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_out; struct v4l2_rect *compose = &dev->compose_out; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_out) ? 2 : 1; int ret; if (!dev->has_crop_out && !dev->has_compose_out) return -ENOTTY; if (s->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; switch (s->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_out) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->fmt_out_rect); if (dev->has_scaler_out) { struct v4l2_rect max_rect = { 0, 0, dev->sink_rect.width * MAX_ZOOM, (dev->sink_rect.height / factor) * MAX_ZOOM }; v4l2_rect_set_max_size(&s->r, &max_rect); if (dev->has_compose_out) { struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, (s->r.height * factor) / MAX_ZOOM }; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, (s->r.height * factor) * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_rect); v4l2_rect_set_max_size(compose, &max_rect); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } } else if (dev->has_compose_out) { s->r.top *= factor; s->r.height *= factor; v4l2_rect_set_max_size(&s->r, &dev->sink_rect); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); s->r.top /= factor; s->r.height /= factor; } else { v4l2_rect_set_size_to(&s->r, &dev->sink_rect); s->r.height /= factor; } v4l2_rect_map_inside(&s->r, &dev->fmt_out_rect); *crop = s->r; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_out) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->sink_rect); v4l2_rect_map_inside(&s->r, &dev->compose_bounds_out); s->r.top /= factor; s->r.height /= factor; if (dev->has_scaler_out) { struct v4l2_rect fmt = dev->fmt_out_rect; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, s->r.height * MAX_ZOOM }; struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, s->r.height / MAX_ZOOM }; v4l2_rect_set_min_size(&fmt, &min_rect); if (!dev->has_crop_out) v4l2_rect_set_max_size(&fmt, &max_rect); if (!v4l2_rect_same_size(&dev->fmt_out_rect, &fmt) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; if (dev->has_crop_out) { v4l2_rect_set_min_size(crop, &min_rect); v4l2_rect_set_max_size(crop, &max_rect); } dev->fmt_out_rect = fmt; } else if (dev->has_crop_out) { struct v4l2_rect fmt = dev->fmt_out_rect; v4l2_rect_set_min_size(&fmt, &s->r); if (!v4l2_rect_same_size(&dev->fmt_out_rect, &fmt) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; dev->fmt_out_rect = fmt; v4l2_rect_set_size_to(crop, &s->r); v4l2_rect_map_inside(crop, &dev->fmt_out_rect); } else { if (!v4l2_rect_same_size(&s->r, &dev->fmt_out_rect) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; v4l2_rect_set_size_to(&dev->fmt_out_rect, &s->r); v4l2_rect_set_size_to(crop, &s->r); crop->height /= factor; v4l2_rect_map_inside(crop, &dev->fmt_out_rect); } s->r.top *= factor; s->r.height *= factor; *compose = s->r; break; default: return -EINVAL; } return 0; } int vivid_vid_out_g_pixelaspect(struct file *file, void *priv, int type, struct v4l2_fract *f) { struct vivid_dev *dev = video_drvdata(file); if (type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; switch (vivid_get_pixel_aspect(dev)) { case TPG_PIXEL_ASPECT_NTSC: f->numerator = 11; f->denominator = 10; break; case TPG_PIXEL_ASPECT_PAL: f->numerator = 54; f->denominator = 59; break; default: break; } return 0; } int vidioc_g_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_out; struct v4l2_window *win = &f->fmt.win; if (!dev->has_fb) return -EINVAL; win->w.top = dev->overlay_out_top; win->w.left = dev->overlay_out_left; win->w.width = compose->width; win->w.height = compose->height; win->field = V4L2_FIELD_ANY; win->chromakey = dev->chromakey_out; win->global_alpha = dev->global_alpha_out; return 0; } int vidioc_try_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_out; struct v4l2_window *win = &f->fmt.win; if (!dev->has_fb) return -EINVAL; win->w.left = clamp_t(int, win->w.left, -dev->display_width, dev->display_width); win->w.top = clamp_t(int, win->w.top, -dev->display_height, dev->display_height); win->w.width = compose->width; win->w.height = compose->height; /* * It makes no sense for an OSD to overlay only top or bottom fields, * so always set this to ANY. */ win->field = V4L2_FIELD_ANY; return 0; } int vidioc_s_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_window *win = &f->fmt.win; int ret = vidioc_try_fmt_vid_out_overlay(file, priv, f); if (ret) return ret; dev->overlay_out_top = win->w.top; dev->overlay_out_left = win->w.left; dev->chromakey_out = win->chromakey; dev->global_alpha_out = win->global_alpha; return ret; } int vivid_vid_out_overlay(struct file *file, void *fh, unsigned i) { struct vivid_dev *dev = video_drvdata(file); if (i && !dev->fmt_out->can_do_overlay) { dprintk(dev, 1, "unsupported output format for output overlay\n"); return -EINVAL; } dev->overlay_out_enabled = i; return 0; } int vivid_vid_out_g_fbuf(struct file *file, void *fh, struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); a->capability = V4L2_FBUF_CAP_EXTERNOVERLAY | V4L2_FBUF_CAP_CHROMAKEY | V4L2_FBUF_CAP_SRC_CHROMAKEY | V4L2_FBUF_CAP_GLOBAL_ALPHA | V4L2_FBUF_CAP_LOCAL_ALPHA | V4L2_FBUF_CAP_LOCAL_INV_ALPHA; a->flags = V4L2_FBUF_FLAG_OVERLAY | dev->fbuf_out_flags; a->base = (void *)dev->video_pbase; a->fmt.width = dev->display_width; a->fmt.height = dev->display_height; if (vivid_fb_green_bits(dev) == 5) a->fmt.pixelformat = V4L2_PIX_FMT_ARGB555; else a->fmt.pixelformat = V4L2_PIX_FMT_RGB565; a->fmt.bytesperline = dev->display_byte_stride; a->fmt.sizeimage = a->fmt.height * a->fmt.bytesperline; a->fmt.field = V4L2_FIELD_NONE; a->fmt.colorspace = V4L2_COLORSPACE_SRGB; a->fmt.priv = 0; return 0; } int vivid_vid_out_s_fbuf(struct file *file, void *fh, const struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); const unsigned chroma_flags = V4L2_FBUF_FLAG_CHROMAKEY | V4L2_FBUF_FLAG_SRC_CHROMAKEY; const unsigned alpha_flags = V4L2_FBUF_FLAG_GLOBAL_ALPHA | V4L2_FBUF_FLAG_LOCAL_ALPHA | V4L2_FBUF_FLAG_LOCAL_INV_ALPHA; if ((a->flags & chroma_flags) == chroma_flags) return -EINVAL; switch (a->flags & alpha_flags) { case 0: case V4L2_FBUF_FLAG_GLOBAL_ALPHA: case V4L2_FBUF_FLAG_LOCAL_ALPHA: case V4L2_FBUF_FLAG_LOCAL_INV_ALPHA: break; default: return -EINVAL; } dev->fbuf_out_flags &= ~(chroma_flags | alpha_flags); dev->fbuf_out_flags |= a->flags & (chroma_flags | alpha_flags); return 0; } static const struct v4l2_audioout vivid_audio_outputs[] = { { 0, "Line-Out 1" }, { 1, "Line-Out 2" }, }; int vidioc_enum_output(struct file *file, void *priv, struct v4l2_output *out) { struct vivid_dev *dev = video_drvdata(file); if (out->index >= dev->num_outputs) return -EINVAL; out->type = V4L2_OUTPUT_TYPE_ANALOG; switch (dev->output_type[out->index]) { case SVID: snprintf(out->name, sizeof(out->name), "S-Video %03u-%u", dev->inst, dev->output_name_counter[out->index]); out->std = V4L2_STD_ALL; if (dev->has_audio_outputs) out->audioset = (1 << ARRAY_SIZE(vivid_audio_outputs)) - 1; out->capabilities = V4L2_OUT_CAP_STD; break; case HDMI: snprintf(out->name, sizeof(out->name), "HDMI %03u-%u", dev->inst, dev->output_name_counter[out->index]); out->capabilities = V4L2_OUT_CAP_DV_TIMINGS; break; } return 0; } int vidioc_g_output(struct file *file, void *priv, unsigned *o) { struct vivid_dev *dev = video_drvdata(file); *o = dev->output; return 0; } int vidioc_s_output(struct file *file, void *priv, unsigned o) { struct vivid_dev *dev = video_drvdata(file); if (o >= dev->num_outputs) return -EINVAL; if (o == dev->output) return 0; if (vb2_is_busy(&dev->vb_vid_out_q) || vb2_is_busy(&dev->vb_vbi_out_q) || vb2_is_busy(&dev->vb_meta_out_q)) return -EBUSY; dev->output = o; dev->tv_audio_output = 0; if (dev->output_type[o] == SVID) dev->vid_out_dev.tvnorms = V4L2_STD_ALL; else dev->vid_out_dev.tvnorms = 0; dev->vbi_out_dev.tvnorms = dev->vid_out_dev.tvnorms; dev->meta_out_dev.tvnorms = dev->vid_out_dev.tvnorms; vivid_update_format_out(dev); return 0; } int vidioc_enumaudout(struct file *file, void *fh, struct v4l2_audioout *vout) { if (vout->index >= ARRAY_SIZE(vivid_audio_outputs)) return -EINVAL; *vout = vivid_audio_outputs[vout->index]; return 0; } int vidioc_g_audout(struct file *file, void *fh, struct v4l2_audioout *vout) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -EINVAL; *vout = vivid_audio_outputs[dev->tv_audio_output]; return 0; } int vidioc_s_audout(struct file *file, void *fh, const struct v4l2_audioout *vout) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -EINVAL; if (vout->index >= ARRAY_SIZE(vivid_audio_outputs)) return -EINVAL; dev->tv_audio_output = vout->index; return 0; } int vivid_vid_out_s_std(struct file *file, void *priv, v4l2_std_id id) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -ENODATA; if (dev->std_out == id) return 0; if (vb2_is_busy(&dev->vb_vid_out_q) || vb2_is_busy(&dev->vb_vbi_out_q)) return -EBUSY; dev->std_out = id; vivid_update_format_out(dev); return 0; } static bool valid_cvt_gtf_timings(struct v4l2_dv_timings *timings) { struct v4l2_bt_timings *bt = &timings->bt; if ((bt->standards & (V4L2_DV_BT_STD_CVT | V4L2_DV_BT_STD_GTF)) && v4l2_valid_dv_timings(timings, &vivid_dv_timings_cap, NULL, NULL)) return true; return false; } int vivid_vid_out_s_dv_timings(struct file *file, void *_fh, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_hdmi_out(dev)) return -ENODATA; if (!v4l2_find_dv_timings_cap(timings, &vivid_dv_timings_cap, 0, NULL, NULL) && !valid_cvt_gtf_timings(timings)) return -EINVAL; if (v4l2_match_dv_timings(timings, &dev->dv_timings_out, 0, true)) return 0; if (vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; dev->dv_timings_out = *timings; vivid_update_format_out(dev); return 0; } int vivid_vid_out_g_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); if (parm->type != (dev->multiplanar ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT)) return -EINVAL; parm->parm.output.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.output.timeperframe = dev->timeperframe_vid_out; parm->parm.output.writebuffers = 1; return 0; } int vidioc_subscribe_event(struct v4l2_fh *fh, const struct v4l2_event_subscription *sub) { switch (sub->type) { case V4L2_EVENT_SOURCE_CHANGE: if (fh->vdev->vfl_dir == VFL_DIR_RX) return v4l2_src_change_event_subscribe(fh, sub); break; default: return v4l2_ctrl_subscribe_event(fh, sub); } return -EINVAL; } |
| 2 1119 2 1118 1121 129 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/srcu.h> #include <linux/interval_tree.h> struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; /** * enum mmu_notifier_event - reason for the mmu notifier callback * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that * move the range * * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like * madvise() or replacing a page by another one, ...). * * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. * The owner is initialized to the value provided by the caller of * make_device_exclusive(), such that this caller can filter out these * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB * which shares page-tables with the CPU. The * invalidate_range_start()/end() callbacks should not be implemented as * invalidate_secondary_tlbs() already catches the points in time when * an external TLB needs to be flushed. * * This requires arch_invalidate_secondary_tlbs() to be called while * holding the ptl spin-lock and therefore this callback is not allowed * to sleep. * * This is called by architecture code whenever invalidating a TLB * entry. It is assumed that any secondary TLB has the same rules for * when invalidations are required. If this is not the case architecture * code will need to call this explicitly when required for secondary * TLB invalidation. */ void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * * Returns true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). A return of true indicates an invalidation * has collided with this critical region and a future * mmu_interval_read_retry() will return true. * * False is not reliable and only suggests a collision may not have * occurred. It can be called many times and does not have to hold the user * provided lock. * * This call can be used as part of loops and other expensive operations to * expedite a retry. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } /* * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it * can return an error if a notifier can't proceed without blocking, in which * case you're not allowed to modify PTEs in the specified range. * * This is mainly intended for OOM handling. */ static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct mm_struct *mm, unsigned long start, unsigned long end) { range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ |
| 4 4 2 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * open/close and reset interface * * Copyright (C) 1998-1999 Takashi Iwai <tiwai@suse.de> */ #include "seq_oss_device.h" #include "seq_oss_synth.h" #include "seq_oss_midi.h" #include "seq_oss_writeq.h" #include "seq_oss_readq.h" #include "seq_oss_timer.h" #include "seq_oss_event.h" #include <linux/init.h> #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/slab.h> #include <linux/workqueue.h> /* * common variables */ static int maxqlen = SNDRV_SEQ_OSS_MAX_QLEN; module_param(maxqlen, int, 0444); MODULE_PARM_DESC(maxqlen, "maximum queue length"); static int system_client = -1; /* ALSA sequencer client number */ static int system_port = -1; static int num_clients; static struct seq_oss_devinfo *client_table[SNDRV_SEQ_OSS_MAX_CLIENTS]; /* * prototypes */ static int receive_announce(struct snd_seq_event *ev, int direct, void *private, int atomic, int hop); static int translate_mode(struct file *file); static int create_port(struct seq_oss_devinfo *dp); static int delete_port(struct seq_oss_devinfo *dp); static int alloc_seq_queue(struct seq_oss_devinfo *dp); static int delete_seq_queue(int queue); static void free_devinfo(void *private); #define call_ctl(type,rec) snd_seq_kernel_client_ctl(system_client, type, rec) /* call snd_seq_oss_midi_lookup_ports() asynchronously */ static void async_call_lookup_ports(struct work_struct *work) { snd_seq_oss_midi_lookup_ports(system_client); } static DECLARE_WORK(async_lookup_work, async_call_lookup_ports); /* * create sequencer client for OSS sequencer */ int __init snd_seq_oss_create_client(void) { int rc; struct snd_seq_port_info *port __free(kfree) = NULL; struct snd_seq_port_callback port_callback; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; /* create ALSA client */ rc = snd_seq_create_kernel_client(NULL, SNDRV_SEQ_CLIENT_OSS, "OSS sequencer"); if (rc < 0) return rc; system_client = rc; /* create announcement receiver port */ strcpy(port->name, "Receiver"); port->addr.client = system_client; port->capability = SNDRV_SEQ_PORT_CAP_WRITE; /* receive only */ port->type = 0; memset(&port_callback, 0, sizeof(port_callback)); /* don't set port_callback.owner here. otherwise the module counter * is incremented and we can no longer release the module.. */ port_callback.event_input = receive_announce; port->kernel = &port_callback; if (call_ctl(SNDRV_SEQ_IOCTL_CREATE_PORT, port) >= 0) { struct snd_seq_port_subscribe subs; system_port = port->addr.port; memset(&subs, 0, sizeof(subs)); subs.sender.client = SNDRV_SEQ_CLIENT_SYSTEM; subs.sender.port = SNDRV_SEQ_PORT_SYSTEM_ANNOUNCE; subs.dest.client = system_client; subs.dest.port = system_port; call_ctl(SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs); } /* look up midi devices */ schedule_work(&async_lookup_work); return 0; } /* * receive announcement from system port, and check the midi device */ static int receive_announce(struct snd_seq_event *ev, int direct, void *private, int atomic, int hop) { struct snd_seq_port_info pinfo; if (atomic) return 0; /* it must not happen */ switch (ev->type) { case SNDRV_SEQ_EVENT_PORT_START: case SNDRV_SEQ_EVENT_PORT_CHANGE: if (ev->data.addr.client == system_client) break; /* ignore myself */ memset(&pinfo, 0, sizeof(pinfo)); pinfo.addr = ev->data.addr; if (call_ctl(SNDRV_SEQ_IOCTL_GET_PORT_INFO, &pinfo) >= 0) snd_seq_oss_midi_check_new_port(&pinfo); break; case SNDRV_SEQ_EVENT_PORT_EXIT: if (ev->data.addr.client == system_client) break; /* ignore myself */ snd_seq_oss_midi_check_exit_port(ev->data.addr.client, ev->data.addr.port); break; } return 0; } /* * delete OSS sequencer client */ int snd_seq_oss_delete_client(void) { cancel_work_sync(&async_lookup_work); if (system_client >= 0) snd_seq_delete_kernel_client(system_client); snd_seq_oss_midi_clear_all(); return 0; } /* * open sequencer device */ int snd_seq_oss_open(struct file *file, int level) { int i, rc; struct seq_oss_devinfo *dp; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (!dp) return -ENOMEM; dp->cseq = system_client; dp->port = -1; dp->queue = -1; for (i = 0; i < SNDRV_SEQ_OSS_MAX_CLIENTS; i++) { if (client_table[i] == NULL) break; } dp->index = i; if (i >= SNDRV_SEQ_OSS_MAX_CLIENTS) { pr_debug("ALSA: seq_oss: too many applications\n"); rc = -ENOMEM; goto _error; } /* look up synth and midi devices */ snd_seq_oss_synth_setup(dp); snd_seq_oss_midi_setup(dp); if (dp->synth_opened == 0 && dp->max_mididev == 0) { /* pr_err("ALSA: seq_oss: no device found\n"); */ rc = -ENODEV; goto _error; } /* create port */ rc = create_port(dp); if (rc < 0) { pr_err("ALSA: seq_oss: can't create port\n"); goto _error; } /* allocate queue */ rc = alloc_seq_queue(dp); if (rc < 0) goto _error; /* set address */ dp->addr.client = dp->cseq; dp->addr.port = dp->port; /*dp->addr.queue = dp->queue;*/ /*dp->addr.channel = 0;*/ dp->seq_mode = level; /* set up file mode */ dp->file_mode = translate_mode(file); /* initialize read queue */ if (is_read_mode(dp->file_mode)) { dp->readq = snd_seq_oss_readq_new(dp, maxqlen); if (!dp->readq) { rc = -ENOMEM; goto _error; } } /* initialize write queue */ if (is_write_mode(dp->file_mode)) { dp->writeq = snd_seq_oss_writeq_new(dp, maxqlen); if (!dp->writeq) { rc = -ENOMEM; goto _error; } } /* initialize timer */ dp->timer = snd_seq_oss_timer_new(dp); if (!dp->timer) { pr_err("ALSA: seq_oss: can't alloc timer\n"); rc = -ENOMEM; goto _error; } /* set private data pointer */ file->private_data = dp; /* set up for mode2 */ if (level == SNDRV_SEQ_OSS_MODE_MUSIC) snd_seq_oss_synth_setup_midi(dp); else if (is_read_mode(dp->file_mode)) snd_seq_oss_midi_open_all(dp, SNDRV_SEQ_OSS_FILE_READ); client_table[dp->index] = dp; num_clients++; return 0; _error: snd_seq_oss_synth_cleanup(dp); snd_seq_oss_midi_cleanup(dp); delete_seq_queue(dp->queue); delete_port(dp); return rc; } /* * translate file flags to private mode */ static int translate_mode(struct file *file) { int file_mode = 0; if ((file->f_flags & O_ACCMODE) != O_RDONLY) file_mode |= SNDRV_SEQ_OSS_FILE_WRITE; if ((file->f_flags & O_ACCMODE) != O_WRONLY) file_mode |= SNDRV_SEQ_OSS_FILE_READ; if (file->f_flags & O_NONBLOCK) file_mode |= SNDRV_SEQ_OSS_FILE_NONBLOCK; return file_mode; } /* * create sequencer port */ static int create_port(struct seq_oss_devinfo *dp) { int rc; struct snd_seq_port_info port; struct snd_seq_port_callback callback; memset(&port, 0, sizeof(port)); port.addr.client = dp->cseq; sprintf(port.name, "Sequencer-%d", dp->index); port.capability = SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_WRITE; /* no subscription */ port.type = SNDRV_SEQ_PORT_TYPE_SPECIFIC; port.midi_channels = 128; port.synth_voices = 128; memset(&callback, 0, sizeof(callback)); callback.owner = THIS_MODULE; callback.private_data = dp; callback.event_input = snd_seq_oss_event_input; callback.private_free = free_devinfo; port.kernel = &callback; rc = call_ctl(SNDRV_SEQ_IOCTL_CREATE_PORT, &port); if (rc < 0) return rc; dp->port = port.addr.port; return 0; } /* * delete ALSA port */ static int delete_port(struct seq_oss_devinfo *dp) { if (dp->port < 0) { kfree(dp); return 0; } return snd_seq_event_port_detach(dp->cseq, dp->port); } /* * allocate a queue */ static int alloc_seq_queue(struct seq_oss_devinfo *dp) { struct snd_seq_queue_info qinfo; int rc; memset(&qinfo, 0, sizeof(qinfo)); qinfo.owner = system_client; qinfo.locked = 1; strcpy(qinfo.name, "OSS Sequencer Emulation"); rc = call_ctl(SNDRV_SEQ_IOCTL_CREATE_QUEUE, &qinfo); if (rc < 0) return rc; dp->queue = qinfo.queue; return 0; } /* * release queue */ static int delete_seq_queue(int queue) { struct snd_seq_queue_info qinfo; int rc; if (queue < 0) return 0; memset(&qinfo, 0, sizeof(qinfo)); qinfo.queue = queue; rc = call_ctl(SNDRV_SEQ_IOCTL_DELETE_QUEUE, &qinfo); if (rc < 0) pr_err("ALSA: seq_oss: unable to delete queue %d (%d)\n", queue, rc); return rc; } /* * free device informations - private_free callback of port */ static void free_devinfo(void *private) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private; snd_seq_oss_timer_delete(dp->timer); snd_seq_oss_writeq_delete(dp->writeq); snd_seq_oss_readq_delete(dp->readq); kfree(dp); } /* * close sequencer device */ void snd_seq_oss_release(struct seq_oss_devinfo *dp) { int queue; client_table[dp->index] = NULL; num_clients--; snd_seq_oss_reset(dp); snd_seq_oss_synth_cleanup(dp); snd_seq_oss_midi_cleanup(dp); /* clear slot */ queue = dp->queue; if (dp->port >= 0) delete_port(dp); delete_seq_queue(queue); } /* * reset sequencer devices */ void snd_seq_oss_reset(struct seq_oss_devinfo *dp) { int i; /* reset all synth devices */ for (i = 0; i < dp->max_synthdev; i++) snd_seq_oss_synth_reset(dp, i); /* reset all midi devices */ if (dp->seq_mode != SNDRV_SEQ_OSS_MODE_MUSIC) { for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_reset(dp, i); } /* remove queues */ if (dp->readq) snd_seq_oss_readq_clear(dp->readq); if (dp->writeq) snd_seq_oss_writeq_clear(dp->writeq); /* reset timer */ snd_seq_oss_timer_stop(dp->timer); } #ifdef CONFIG_SND_PROC_FS /* * misc. functions for proc interface */ static const char * filemode_str(int val) { static const char * const str[] = { "none", "read", "write", "read/write", }; return str[val & SNDRV_SEQ_OSS_FILE_ACMODE]; } /* * proc interface */ void snd_seq_oss_system_info_read(struct snd_info_buffer *buf) { int i; struct seq_oss_devinfo *dp; snd_iprintf(buf, "ALSA client number %d\n", system_client); snd_iprintf(buf, "ALSA receiver port %d\n", system_port); snd_iprintf(buf, "\nNumber of applications: %d\n", num_clients); for (i = 0; i < num_clients; i++) { snd_iprintf(buf, "\nApplication %d: ", i); dp = client_table[i]; if (!dp) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "port %d : queue %d\n", dp->port, dp->queue); snd_iprintf(buf, " sequencer mode = %s : file open mode = %s\n", (dp->seq_mode ? "music" : "synth"), filemode_str(dp->file_mode)); if (dp->seq_mode) snd_iprintf(buf, " timer tempo = %d, timebase = %d\n", dp->timer->oss_tempo, dp->timer->oss_timebase); snd_iprintf(buf, " max queue length %d\n", maxqlen); if (is_read_mode(dp->file_mode) && dp->readq) snd_seq_oss_readq_info_read(dp->readq, buf); } } #endif /* CONFIG_SND_PROC_FS */ |
| 12 12 11 11 7 7 7 1 3 2 1 26 13 1 7 1 1 1 3 1 1 3 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ocfs2/ioctl.c * * Copyright (C) 2006 Herbert Poetzl * adapted from Remy Card's ext2/ioctl.c */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/blkdev.h> #include <linux/compat.h> #include <linux/fileattr.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "journal.h" #include "ocfs2_fs.h" #include "ioctl.h" #include "resize.h" #include "refcounttree.h" #include "sysfile.h" #include "dir.h" #include "buffer_head_io.h" #include "suballoc.h" #include "move_extents.h" #define o2info_from_user(a, b) \ copy_from_user(&(a), (b), sizeof(a)) #define o2info_to_user(a, b) \ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* * This is just a best-effort to tell userspace that this request * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } static inline void o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } static inline int o2info_coherent(struct ocfs2_info_request *req) { return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags; int status; status = ocfs2_inode_lock(inode, NULL, 0); if (status < 0) { mlog_errno(status); return status; } ocfs2_get_inode_flags(OCFS2_I(inode)); flags = OCFS2_I(inode)->ip_attr; ocfs2_inode_unlock(inode, 0); fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE); return status; } int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags = fa->flags; struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; struct buffer_head *bh = NULL; unsigned oldflags; int status; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { mlog_errno(status); goto bail; } if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; oldflags = ocfs2_inode->ip_attr; flags = flags & OCFS2_FL_MODIFIABLE; flags |= oldflags & ~OCFS2_FL_MODIFIABLE; /* Check already done by VFS, but repeat with ocfs lock */ status = -EPERM; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) goto bail_unlock; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto bail_unlock; } ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) mlog_errno(status); ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode, 1); bail: brelse(bh); return status; } static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) return -EFAULT; return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct inode *inode_alloc, u64 blkno, struct ocfs2_info_freeinode *fi, u32 slot) { int status = 0, unlock = 0; struct buffer_head *bh = NULL; struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) inode_lock(inode_alloc); if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } unlock = 1; } else { status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; } } dinode_alloc = (struct ocfs2_dinode *)bh->b_data; fi->ifi_stat[slot].lfi_total = le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); fi->ifi_stat[slot].lfi_free = le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); bail: if (unlock) ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) inode_unlock(inode_alloc); brelse(bh); return status; } static int ocfs2_info_handle_freeinode(struct inode *inode, struct ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; char namebuf[40]; int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); if (!oifi) { status = -ENOMEM; mlog_errno(status); goto out_err; } if (o2info_from_user(*oifi, req)) { status = -EFAULT; goto out_free; } oifi->ifi_slotnum = osb->max_slots; for (i = 0; i < oifi->ifi_slotnum; i++) { if (o2info_coherent(&oifi->ifi_req)) { inode_alloc = ocfs2_get_system_file_inode(osb, type, i); if (!inode_alloc) { mlog(ML_ERROR, "unable to get alloc inode in " "slot %u\n", i); status = -EIO; goto bail; } } else { ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, strlen(namebuf), &blkno); if (status < 0) { status = -ENOENT; goto bail; } } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); iput(inode_alloc); inode_alloc = NULL; if (status < 0) goto bail; } o2info_set_request_filled(&oifi->ifi_req); if (o2info_to_user(*oifi, req)) { status = -EFAULT; goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); out_free: kfree(oifi); out_err: return status; } static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) index = OCFS2_INFO_MAX_HIST - 1; hist->fc_chunks[index]++; hist->fc_clusters[index] += chunksize; } static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, unsigned int chunksize) { if (chunksize > stats->ffs_max) stats->ffs_max = chunksize; if (chunksize < stats->ffs_min) stats->ffs_min = chunksize; stats->ffs_avg += chunksize; stats->ffs_free_chunks_real++; } static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, struct inode *gb_inode, struct ocfs2_dinode *gb_dinode, struct ocfs2_chain_rec *rec, struct ocfs2_info_freefrag *ffg, u32 chunks_in_group) { int status = 0, used; u64 blkno; struct buffer_head *bh = NULL; struct ocfs2_group_desc *bg = NULL; unsigned int max_bits, num_clusters; unsigned int offset = 0, cluster, chunk; unsigned int chunk_free, last_chunksize = 0; if (!le32_to_cpu(rec->c_free)) goto bail; do { if (!bg) blkno = le64_to_cpu(rec->c_blkno); else blkno = le64_to_cpu(bg->bg_next_group); if (bh) { brelse(bh); bh = NULL; } if (o2info_coherent(&ffg->iff_req)) status = ocfs2_read_group_descriptor(gb_inode, gb_dinode, blkno, &bh); else status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog(ML_ERROR, "Can't read the group descriptor # " "%llu from device.", (unsigned long long)blkno); status = -EIO; goto bail; } bg = (struct ocfs2_group_desc *)bh->b_data; if (!le16_to_cpu(bg->bg_free_bits_count)) continue; max_bits = le16_to_cpu(bg->bg_bits); offset = 0; for (chunk = 0; chunk < chunks_in_group; chunk++) { /* * last chunk may be not an entire one. */ if ((offset + ffg->iff_chunksize) > max_bits) num_clusters = max_bits - offset; else num_clusters = ffg->iff_chunksize; chunk_free = 0; for (cluster = 0; cluster < num_clusters; cluster++) { used = ocfs2_test_bit(offset, (unsigned long *)bg->bg_bitmap); /* * - chunk_free counts free clusters in #N chunk. * - last_chunksize records the size(in) clusters * for the last real free chunk being counted. */ if (!used) { last_chunksize++; chunk_free++; } if (used && last_chunksize) { ocfs2_info_update_ffg(ffg, last_chunksize); last_chunksize = 0; } offset++; } if (chunk_free == ffg->iff_chunksize) ffg->iff_ffs.ffs_free_chunks++; } /* * need to update the info for last free chunk. */ if (last_chunksize) ocfs2_info_update_ffg(ffg, last_chunksize); } while (le64_to_cpu(bg->bg_next_group)); bail: brelse(bh); return status; } static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct inode *gb_inode, u64 blkno, struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; struct buffer_head *bh = NULL; struct ocfs2_chain_list *cl = NULL; struct ocfs2_chain_rec *rec = NULL; struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } unlock = 1; } else { status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; } } gb_dinode = (struct ocfs2_dinode *)bh->b_data; cl = &(gb_dinode->id2.i_chain); /* * Chunksize(in) clusters from userspace should be * less than clusters in a group. */ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { status = -EINVAL; goto bail; } memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); ffg->iff_ffs.ffs_min = ~0U; ffg->iff_ffs.ffs_clusters = le32_to_cpu(gb_dinode->id1.bitmap1.i_total); ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - le32_to_cpu(gb_dinode->id1.bitmap1.i_used); chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { rec = &(cl->cl_recs[i]); status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, gb_dinode, rec, ffg, chunks_in_group); if (status) goto bail; } if (ffg->iff_ffs.ffs_free_chunks_real) ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / ffg->iff_ffs.ffs_free_chunks_real); bail: if (unlock) ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) inode_unlock(gb_inode); iput(gb_inode); brelse(bh); return status; } static int ocfs2_info_handle_freefrag(struct inode *inode, struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *gb_inode = NULL; oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); if (!oiff) { status = -ENOMEM; mlog_errno(status); goto out_err; } if (o2info_from_user(*oiff, req)) { status = -EFAULT; goto out_free; } /* * chunksize from userspace should be power of 2. */ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || (!oiff->iff_chunksize)) { status = -EINVAL; goto bail; } if (o2info_coherent(&oiff->iff_req)) { gb_inode = ocfs2_get_system_file_inode(osb, type, OCFS2_INVALID_SLOT); if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); status = -EIO; goto bail; } } else { ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, strlen(namebuf), &blkno); if (status < 0) { status = -ENOENT; goto bail; } } status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); if (status < 0) goto bail; o2info_set_request_filled(&oiff->iff_req); if (o2info_to_user(*oiff, req)) { status = -EFAULT; goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); out_free: kfree(oiff); out_err: return status; } static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) return -EFAULT; return 0; } /* * Validate and distinguish OCFS2_IOC_INFO requests. * * - validate the magic number. * - distinguish different requests. * - validate size of different requests. */ static int ocfs2_info_handle_request(struct inode *inode, struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) goto bail; status = -EINVAL; if (oir.ir_magic != OCFS2_INFO_MAGIC) goto bail; switch (oir.ir_code) { case OCFS2_INFO_BLOCKSIZE: if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) status = ocfs2_info_handle_blocksize(inode, req); break; case OCFS2_INFO_CLUSTERSIZE: if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) status = ocfs2_info_handle_clustersize(inode, req); break; case OCFS2_INFO_MAXSLOTS: if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) status = ocfs2_info_handle_maxslots(inode, req); break; case OCFS2_INFO_LABEL: if (oir.ir_size == sizeof(struct ocfs2_info_label)) status = ocfs2_info_handle_label(inode, req); break; case OCFS2_INFO_UUID: if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) status = ocfs2_info_handle_uuid(inode, req); break; case OCFS2_INFO_FS_FEATURES: if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) status = ocfs2_info_handle_fs_features(inode, req); break; case OCFS2_INFO_JOURNAL_SIZE: if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; case OCFS2_INFO_FREEINODE: if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) status = ocfs2_info_handle_freeinode(inode, req); break; case OCFS2_INFO_FREEFRAG: if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) status = ocfs2_info_handle_freefrag(inode, req); break; default: status = ocfs2_info_handle_unknown(inode, req); break; } bail: return status; } static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; if (compat_flag) { #ifdef CONFIG_COMPAT /* * pointer bp stores the base address of a pointers array, * which collects all addresses of separate request. */ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); #else BUG(); #endif } else bp = (u64 __user *)(unsigned long)(info->oi_requests); if (o2info_from_user(*req_addr, bp + idx)) goto bail; status = 0; bail: return status; } /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * * ocfs2_info_handle() receives a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * * Idea here is to make each separate request small enough to ensure * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ static noinline_for_stack int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; struct ocfs2_info_request __user *reqp; if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || (!info->oi_requests)) { status = -EINVAL; goto bail; } for (i = 0; i < info->oi_count; i++) { status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); if (status) break; reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr; if (!reqp) { status = -EINVAL; goto bail; } status = ocfs2_info_handle_request(inode, reqp); if (status) break; } bail: return status; } long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); void __user *argp = (void __user *)arg; int status; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: { struct ocfs2_space_resv sr; if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); } case OCFS2_IOC_GROUP_EXTEND: { int new_clusters; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (get_user(new_clusters, (int __user *)arg)) return -EFAULT; status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: { struct ocfs2_new_group_input input; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (copy_from_user(&input, (int __user *) arg, sizeof(input))) return -EFAULT; status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; } case OCFS2_IOC_REFLINK: { struct reflink_arguments args; const char __user *old_path; const char __user *new_path; bool preserve; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; new_path = (const char __user *)(unsigned long)args.new_path; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); } case OCFS2_IOC_INFO: { struct ocfs2_info info; if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); } case FITRIM: { struct super_block *sb = inode->i_sb; struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user(argp, &range, sizeof(range))) return -EFAULT; return 0; } case OCFS2_IOC_MOVE_EXT: return ocfs2_ioctl_move_extents(filp, argp); default: return -ENOTTY; } } #ifdef CONFIG_COMPAT long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) { bool preserve; struct reflink_arguments args; struct inode *inode = file_inode(file); struct ocfs2_info info; void __user *argp = (void __user *)arg; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); case OCFS2_IOC_INFO: if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); case FITRIM: case OCFS2_IOC_MOVE_EXT: break; default: return -ENOIOCTLCMD; } return ocfs2_ioctl(file, cmd, arg); } #endif |
| 408 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/netns/ipv6.h> #include <net/ip6_fib.h> int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifiers(net, event_type, info); } static unsigned int fib6_seq_read(const struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } static int fib6_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib6_rules_dump(net, nb, extack); if (err) return err; return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { .family = AF_INET6, .fib_seq_read = fib6_seq_read, .fib_dump = fib6_dump, .owner = THIS_MODULE, }; int __net_init fib6_notifier_init(struct net *net) { struct fib_notifier_ops *ops; ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.notifier_ops = ops; return 0; } void __net_exit fib6_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.notifier_ops); } |
| 11 102 98 30 102 102 59 102 95 7 85 59 82 82 73 10 78 30 19 2 18 1 24 24 17 17 17 11 17 11 11 11 4 4 11 6 11 11 1 11 11 11 4 11 7 11 11 11 11 1 4 11 11 11 11 11 2 11 11 10 1 5 11 11 4 11 11 7 7 15 15 11 4 4 4 4 11 11 11 11 11 11 11 5 1 1 11 4 11 11 11 3 2 11 5 11 2 1 2 2 11 11 2 2 11 11 11 1 4 11 11 7 7 11 11 11 11 11 7 11 11 11 11 11 11 11 11 5 11 11 11 11 11 11 11 11 11 11 11 2 2 2 2 2 2 2 2 2 11 11 11 2 11 11 5 11 8 4 4 4 4 11 11 11 11 11 11 5 11 11 2 11 11 11 11 11 11 11 11 11 1 11 11 11 11 11 11 11 11 11 11 1 1 54 62 35 35 34 1 33 4 4 21 4 27 18 1 10 10 11 1 1 11 49 49 49 4 4 4 4 4 4 4 51 2 49 49 7 3 4 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS segment constructor. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. * */ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bitops.h> #include <linux/bio.h> #include <linux/completion.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/crc32.h> #include <linux/pagevec.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include "nilfs.h" #include "btnode.h" #include "page.h" #include "segment.h" #include "sufile.h" #include "cpfile.h" #include "ifile.h" #include "segbuf.h" /* * Segment constructor */ #define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ #define SC_MAX_SEGDELTA 64 /* * Upper limit of the number of segments * appended in collection retry loop */ /* Construction mode */ enum { SC_LSEG_SR = 1, /* Make a logical segment having a super root */ SC_LSEG_DSYNC, /* * Flush data blocks of a given file and make * a logical segment without a super root. */ SC_FLUSH_FILE, /* * Flush data files, leads to segment writes without * creating a checkpoint. */ SC_FLUSH_DAT, /* * Flush DAT file. This also creates segments * without a checkpoint. */ }; /* Stage numbers of dirty block collection */ enum { NILFS_ST_INIT = 0, NILFS_ST_GC, /* Collecting dirty blocks for GC */ NILFS_ST_FILE, NILFS_ST_IFILE, NILFS_ST_CPFILE, NILFS_ST_SUFILE, NILFS_ST_DAT, NILFS_ST_SR, /* Super root */ NILFS_ST_DSYNC, /* Data sync blocks */ NILFS_ST_DONE, }; #define CREATE_TRACE_POINTS #include <trace/events/nilfs2.h> /* * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of * the variable must use them because transition of stage count must involve * trace events (trace_nilfs2_collection_stage_transition). * * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't * produce tracepoint events. It is provided just for making the intention * clear. */ static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci) { sci->sc_stage.scnt++; trace_nilfs2_collection_stage_transition(sci); } static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt) { sci->sc_stage.scnt = next_scnt; trace_nilfs2_collection_stage_transition(sci); } static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci) { return sci->sc_stage.scnt; } /* State flags of collection */ #define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ #define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ #define NILFS_CF_SUFREED 0x0004 /* segment usages has been freed */ #define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED) /* Operations depending on the construction mode and file type */ struct nilfs_sc_operations { int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, struct inode *); int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, struct inode *); int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, struct inode *); void (*write_data_binfo)(struct nilfs_sc_info *, struct nilfs_segsum_pointer *, union nilfs_binfo *); void (*write_node_binfo)(struct nilfs_sc_info *, struct nilfs_segsum_pointer *, union nilfs_binfo *); }; /* * Other definitions */ static void nilfs_segctor_start_timer(struct nilfs_sc_info *); static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); #define nilfs_cnt32_ge(a, b) \ (typecheck(__u32, a) && typecheck(__u32, b) && \ ((__s32)((a) - (b)) >= 0)) static int nilfs_prepare_segment_lock(struct super_block *sb, struct nilfs_transaction_info *ti) { struct nilfs_transaction_info *cur_ti = current->journal_info; void *save = NULL; if (cur_ti) { if (cur_ti->ti_magic == NILFS_TI_MAGIC) return ++cur_ti->ti_count; /* * If journal_info field is occupied by other FS, * it is saved and will be restored on * nilfs_transaction_commit(). */ nilfs_warn(sb, "journal info from a different FS"); save = current->journal_info; } if (!ti) { ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); if (!ti) return -ENOMEM; ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; } else { ti->ti_flags = 0; } ti->ti_count = 0; ti->ti_save = save; ti->ti_magic = NILFS_TI_MAGIC; current->journal_info = ti; return 0; } /** * nilfs_transaction_begin - start indivisible file operations. * @sb: super block * @ti: nilfs_transaction_info * @vacancy_check: flags for vacancy rate checks * * nilfs_transaction_begin() acquires a reader/writer semaphore, called * the segment semaphore, to make a segment construction and write tasks * exclusive. The function is used with nilfs_transaction_commit() in pairs. * The region enclosed by these two functions can be nested. To avoid a * deadlock, the semaphore is only acquired or released in the outermost call. * * This function allocates a nilfs_transaction_info struct to keep context * information on it. It is initialized and hooked onto the current task in * the outermost call. If a pre-allocated struct is given to @ti, it is used * instead; otherwise a new struct is assigned from a slab. * * When @vacancy_check flag is set, this function will check the amount of * free space, and will wait for the GC to reclaim disk space if low capacity. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (if checking free space). */ int nilfs_transaction_begin(struct super_block *sb, struct nilfs_transaction_info *ti, int vacancy_check) { struct the_nilfs *nilfs; int ret = nilfs_prepare_segment_lock(sb, ti); struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) return ret; if (ret > 0) { trace_ti = current->journal_info; trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, trace_ti->ti_flags, TRACE_NILFS2_TRANSACTION_BEGIN); return 0; } sb_start_intwrite(sb); nilfs = sb->s_fs_info; down_read(&nilfs->ns_segctor_sem); if (vacancy_check && nilfs_near_disk_full(nilfs)) { up_read(&nilfs->ns_segctor_sem); ret = -ENOSPC; goto failed; } trace_ti = current->journal_info; trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, trace_ti->ti_flags, TRACE_NILFS2_TRANSACTION_BEGIN); return 0; failed: ti = current->journal_info; current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); sb_end_intwrite(sb); return ret; } /** * nilfs_transaction_commit - commit indivisible file operations. * @sb: super block * * nilfs_transaction_commit() releases the read semaphore which is * acquired by nilfs_transaction_begin(). This is only performed * in outermost call of this function. If a commit flag is set, * nilfs_transaction_commit() sets a timer to start the segment * constructor. If a sync flag is set, it starts construction * directly. * * Return: 0 on success, or a negative error code on failure. */ int nilfs_transaction_commit(struct super_block *sb) { struct nilfs_transaction_info *ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; int err = 0; BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); ti->ti_flags |= NILFS_TI_COMMIT; if (ti->ti_count > 0) { ti->ti_count--; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); return 0; } if (nilfs->ns_writer) { struct nilfs_sc_info *sci = nilfs->ns_writer; if (ti->ti_flags & NILFS_TI_COMMIT) nilfs_segctor_start_timer(sci); if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark) nilfs_segctor_do_flush(sci, 0); } up_read(&nilfs->ns_segctor_sem); trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_SYNC) err = nilfs_construct_segment(sb); if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); sb_end_intwrite(sb); return err; } void nilfs_transaction_abort(struct super_block *sb) { struct nilfs_transaction_info *ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); if (ti->ti_count > 0) { ti->ti_count--; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); return; } up_read(&nilfs->ns_segctor_sem); trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); sb_end_intwrite(sb); } void nilfs_relax_pressure_in_lock(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request) return; set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); up_read(&nilfs->ns_segctor_sem); down_write(&nilfs->ns_segctor_sem); if (sci->sc_flush_request && test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { struct nilfs_transaction_info *ti = current->journal_info; ti->ti_flags |= NILFS_TI_WRITER; nilfs_segctor_do_immediate_flush(sci); ti->ti_flags &= ~NILFS_TI_WRITER; } downgrade_write(&nilfs->ns_segctor_sem); } static void nilfs_transaction_lock(struct super_block *sb, struct nilfs_transaction_info *ti, int gcflag) { struct nilfs_transaction_info *cur_ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; WARN_ON(cur_ti); ti->ti_flags = NILFS_TI_WRITER; ti->ti_count = 0; ti->ti_save = cur_ti; ti->ti_magic = NILFS_TI_MAGIC; current->journal_info = ti; for (;;) { trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK); down_write(&nilfs->ns_segctor_sem); if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) break; nilfs_segctor_do_immediate_flush(sci); up_write(&nilfs->ns_segctor_sem); cond_resched(); } if (gcflag) ti->ti_flags |= NILFS_TI_GC; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK); } static void nilfs_transaction_unlock(struct super_block *sb) { struct nilfs_transaction_info *ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); BUG_ON(ti->ti_count > 0); up_write(&nilfs->ns_segctor_sem); current->journal_info = ti->ti_save; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK); } static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, unsigned int bytes) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; unsigned int blocksize = sci->sc_super->s_blocksize; void *p; if (unlikely(ssp->offset + bytes > blocksize)) { ssp->offset = 0; BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, &segbuf->sb_segsum_buffers)); ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); } p = ssp->bh->b_data + ssp->offset; ssp->offset += bytes; return p; } /** * nilfs_segctor_reset_segment_buffer - reset the current segment buffer * @sci: nilfs_sc_info * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; struct buffer_head *sumbh; unsigned int sumbytes; unsigned int flags = 0; int err; if (nilfs_doing_gc()) flags = NILFS_SS_GC; err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno); if (unlikely(err)) return err; sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); sumbytes = segbuf->sb_sum.sumbytes; sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; return 0; } /** * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area * @sci: segment constructor object * * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of * the current segment summary block. */ static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci) { struct nilfs_segsum_pointer *ssp; ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr; if (ssp->offset < ssp->bh->b_size) memset(ssp->bh->b_data + ssp->offset, 0, ssp->bh->b_size - ssp->offset); } static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) return -E2BIG; /* * The current segment is filled up * (internal code) */ nilfs_segctor_zeropad_segsum(sci); sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; int err; if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { err = nilfs_segctor_feed_segment(sci); if (err) return err; segbuf = sci->sc_curseg; } err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); if (likely(!err)) segbuf->sb_sum.flags |= NILFS_SS_SR; return err; } /* * Functions for making segment summary and payloads */ static int nilfs_segctor_segsum_block_required( struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, unsigned int binfo_size) { unsigned int blocksize = sci->sc_super->s_blocksize; /* Size of finfo and binfo is enough small against blocksize */ return ssp->offset + binfo_size + (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) > blocksize; } static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, struct inode *inode) { sci->sc_curseg->sb_sum.nfinfo++; sci->sc_binfo_ptr = sci->sc_finfo_ptr; nilfs_segctor_map_segsum_entry( sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); if (NILFS_I(inode)->i_root && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); /* skip finfo */ } static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, struct inode *inode) { struct nilfs_finfo *finfo; struct nilfs_inode_info *ii; struct nilfs_segment_buffer *segbuf; __u64 cno; if (sci->sc_blk_cnt == 0) return; ii = NILFS_I(inode); if (ii->i_type & NILFS_I_TYPE_GC) cno = ii->i_cno; else if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) cno = 0; else cno = sci->sc_cno; finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, sizeof(*finfo)); finfo->fi_ino = cpu_to_le64(inode->i_ino); finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); finfo->fi_cno = cpu_to_le64(cno); segbuf = sci->sc_curseg; segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); sci->sc_finfo_ptr = sci->sc_binfo_ptr; sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; } static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode, unsigned int binfo_size) { struct nilfs_segment_buffer *segbuf; int required, err = 0; retry: segbuf = sci->sc_curseg; required = nilfs_segctor_segsum_block_required( sci, &sci->sc_binfo_ptr, binfo_size); if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { nilfs_segctor_end_finfo(sci, inode); err = nilfs_segctor_feed_segment(sci); if (err) return err; goto retry; } if (unlikely(required)) { nilfs_segctor_zeropad_segsum(sci); err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) goto failed; } if (sci->sc_blk_cnt == 0) nilfs_segctor_begin_finfo(sci, inode); nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); /* Substitution to vblocknr is delayed until update_blocknr() */ nilfs_segbuf_add_file_buffer(segbuf, bh); sci->sc_blk_cnt++; failed: return err; } /* * Callback functions that enumerate, mark, and collect dirty blocks */ static int nilfs_collect_file_data(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); if (err < 0) return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(struct nilfs_binfo_v)); if (!err) sci->sc_datablk_cnt++; return err; } static int nilfs_collect_file_node(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); } static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { WARN_ON(!buffer_dirty(bh)); return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); } static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( sci, ssp, sizeof(*binfo_v)); *binfo_v = binfo->bi_v; } static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { __le64 *vblocknr = nilfs_segctor_map_segsum_entry( sci, ssp, sizeof(*vblocknr)); *vblocknr = binfo->bi_v.bi_vblocknr; } static const struct nilfs_sc_operations nilfs_sc_file_ops = { .collect_data = nilfs_collect_file_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_file_bmap, .write_data_binfo = nilfs_write_file_data_binfo, .write_node_binfo = nilfs_write_file_node_binfo, }; static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); if (err < 0) return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); if (!err) sci->sc_datablk_cnt++; return err; } static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { WARN_ON(!buffer_dirty(bh)); return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(struct nilfs_binfo_dat)); } static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*blkoff)); *blkoff = binfo->bi_dat.bi_blkoff; } static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { struct nilfs_binfo_dat *binfo_dat = nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); *binfo_dat = binfo->bi_dat; } static const struct nilfs_sc_operations nilfs_sc_dat_ops = { .collect_data = nilfs_collect_dat_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_dat_bmap, .write_data_binfo = nilfs_write_dat_data_binfo, .write_node_binfo = nilfs_write_dat_node_binfo, }; static const struct nilfs_sc_operations nilfs_sc_dsync_ops = { .collect_data = nilfs_collect_file_data, .collect_node = NULL, .collect_bmap = NULL, .write_data_binfo = nilfs_write_file_data_binfo, .write_node_binfo = NULL, }; static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, struct list_head *listp, size_t nlimit, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; struct folio_batch fbatch; pgoff_t index = 0, last = ULONG_MAX; size_t ndirties = 0; int i; if (unlikely(start != 0 || end != LLONG_MAX)) { /* * A valid range is given for sync-ing data pages. The * range is rounded to per-page; extra dirty buffers * may be included if blocksize < pagesize. */ index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } folio_batch_init(&fbatch); repeat: if (unlikely(index > last) || !filemap_get_folios_tag(mapping, &index, last, PAGECACHE_TAG_DIRTY, &fbatch)) return ndirties; for (i = 0; i < folio_batch_count(&fbatch); i++) { struct buffer_head *bh, *head; struct folio *folio = fbatch.folios[i]; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { /* Exclude folios removed from the address space */ folio_unlock(folio); continue; } head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, i_blocksize(inode), 0); bh = head; do { if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; get_bh(bh); list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { folio_unlock(folio); folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); goto repeat; } static void nilfs_lookup_dirty_node_buffers(struct inode *inode, struct list_head *listp) { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; struct folio_batch fbatch; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; if (!btnc_inode) return; folio_batch_init(&fbatch); while (filemap_get_folios_tag(btnc_inode->i_mapping, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh) && !buffer_async_write(bh)) { get_bh(bh); list_add_tail(&bh->b_assoc_buffers, listp); } bh = bh->b_this_page; } while (bh != head); } folio_batch_release(&fbatch); cond_resched(); } } static void nilfs_dispose_list(struct the_nilfs *nilfs, struct list_head *head, int force) { struct nilfs_inode_info *ii, *n; struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; unsigned int nv = 0; while (!list_empty(head)) { spin_lock(&nilfs->ns_inode_lock); list_for_each_entry_safe(ii, n, head, i_dirty) { list_del_init(&ii->i_dirty); if (force) { if (unlikely(ii->i_bh)) { brelse(ii->i_bh); ii->i_bh = NULL; } } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { set_bit(NILFS_I_QUEUED, &ii->i_state); list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files); continue; } ivec[nv++] = ii; if (nv == SC_N_INODEVEC) break; } spin_unlock(&nilfs->ns_inode_lock); for (pii = ivec; nv > 0; pii++, nv--) iput(&(*pii)->vfs_inode); } } static void nilfs_iput_work_func(struct work_struct *work) { struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info, sc_iput_work); struct the_nilfs *nilfs = sci->sc_super->s_fs_info; nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0); } static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, struct nilfs_root *root) { int ret = 0; if (nilfs_mdt_fetch_dirty(root->ifile)) ret++; if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) ret++; if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) ret++; if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) ret++; return ret; } static int nilfs_segctor_clean(struct nilfs_sc_info *sci) { return list_empty(&sci->sc_dirty_files) && !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && sci->sc_nfreesegs == 0 && (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); } static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int ret = 0; if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) set_bit(NILFS_SC_DIRTY, &sci->sc_flags); spin_lock(&nilfs->ns_inode_lock); if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci)) ret++; spin_unlock(&nilfs->ns_inode_lock); return ret; } static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; nilfs_mdt_clear_dirty(sci->sc_root->ifile); nilfs_mdt_clear_dirty(nilfs->ns_cpfile); nilfs_mdt_clear_dirty(nilfs->ns_sufile); nilfs_mdt_clear_dirty(nilfs->ns_dat); } static void nilfs_fill_in_file_bmap(struct inode *ifile, struct nilfs_inode_info *ii) { struct buffer_head *ibh; struct nilfs_inode *raw_inode; if (test_bit(NILFS_I_BMAP, &ii->i_state)) { ibh = ii->i_bh; BUG_ON(!ibh); raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, ibh); nilfs_bmap_write(ii->i_bmap, raw_inode); nilfs_ifile_unmap_inode(raw_inode); } } static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) { struct nilfs_inode_info *ii; list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii); set_bit(NILFS_I_COLLECTED, &ii->i_state); } } /** * nilfs_write_root_mdt_inode - export root metadata inode information to * the on-disk inode * @inode: inode object of the root metadata file * @raw_inode: on-disk inode * * nilfs_write_root_mdt_inode() writes inode information and bmap data of * @inode to the inode area of the metadata file allocated on the super root * block created to finalize the log. Since super root blocks are configured * each time, this function zero-fills the unused area of @raw_inode. */ static void nilfs_write_root_mdt_inode(struct inode *inode, struct nilfs_inode *raw_inode) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; nilfs_write_inode_common(inode, raw_inode); /* zero-fill unused portion of raw_inode */ raw_inode->i_xattr = 0; raw_inode->i_pad = 0; memset((void *)raw_inode + sizeof(*raw_inode), 0, nilfs->ns_inode_size - sizeof(*raw_inode)); nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode); } static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr + NILFS_SR_DAT_OFFSET(isz)); nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr + NILFS_SR_CPFILE_OFFSET(isz)); nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz)); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); set_buffer_uptodate(bh_sr); unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) { struct nilfs_inode_info *ii; list_for_each_entry(ii, head, i_dirty) { if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) clear_bit(NILFS_I_COLLECTED, &ii->i_state); } } static void nilfs_drop_collected_inodes(struct list_head *head) { struct nilfs_inode_info *ii; list_for_each_entry(ii, head, i_dirty) { if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) continue; clear_bit(NILFS_I_INODE_SYNC, &ii->i_state); set_bit(NILFS_I_UPDATED, &ii->i_state); } } static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, struct inode *inode, struct list_head *listp, int (*collect)(struct nilfs_sc_info *, struct buffer_head *, struct inode *)) { struct buffer_head *bh, *n; int err = 0; if (collect) { list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); err = collect(sci, bh, inode); brelse(bh); if (unlikely(err)) goto dispose_buffers; } return 0; } dispose_buffers: while (!list_empty(listp)) { bh = list_first_entry(listp, struct buffer_head, b_assoc_buffers); list_del_init(&bh->b_assoc_buffers); brelse(bh); } return err; } static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) { /* Remaining number of blocks within segment buffer */ return sci->sc_segbuf_nblocks - (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); } static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, struct inode *inode, const struct nilfs_sc_operations *sc_ops) { LIST_HEAD(data_buffers); LIST_HEAD(node_buffers); int err; if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { size_t n, rest = nilfs_segctor_buffer_rest(sci); n = nilfs_lookup_dirty_data_buffers( inode, &data_buffers, rest + 1, 0, LLONG_MAX); if (n > rest) { err = nilfs_segctor_apply_buffers( sci, inode, &data_buffers, sc_ops->collect_data); BUG_ON(!err); /* always receive -E2BIG or true error */ goto break_or_fail; } } nilfs_lookup_dirty_node_buffers(inode, &node_buffers); if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { err = nilfs_segctor_apply_buffers( sci, inode, &data_buffers, sc_ops->collect_data); if (unlikely(err)) { /* dispose node list */ nilfs_segctor_apply_buffers( sci, inode, &node_buffers, NULL); goto break_or_fail; } sci->sc_stage.flags |= NILFS_CF_NODE; } /* Collect node */ err = nilfs_segctor_apply_buffers( sci, inode, &node_buffers, sc_ops->collect_node); if (unlikely(err)) goto break_or_fail; nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); err = nilfs_segctor_apply_buffers( sci, inode, &node_buffers, sc_ops->collect_bmap); if (unlikely(err)) goto break_or_fail; nilfs_segctor_end_finfo(sci, inode); sci->sc_stage.flags &= ~NILFS_CF_NODE; break_or_fail: return err; } static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, struct inode *inode) { LIST_HEAD(data_buffers); size_t n, rest = nilfs_segctor_buffer_rest(sci); int err; n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, sci->sc_dsync_start, sci->sc_dsync_end); err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, nilfs_collect_file_data); if (!err) { nilfs_segctor_end_finfo(sci, inode); BUG_ON(n > rest); /* always receive -E2BIG or true error if n > rest */ } return err; } /** * nilfs_free_segments - free the segments given by an array of segment numbers * @nilfs: nilfs object * @segnumv: array of segment numbers to be freed * @nsegs: number of segments to be freed in @segnumv * * nilfs_free_segments() wraps nilfs_sufile_freev() and * nilfs_sufile_cancel_freev(), and edits the segment usage metadata file * (sufile) to free all segments given by @segnumv and @nsegs at once. If * it fails midway, it cancels the changes so that none of the segments are * freed. If @nsegs is 0, this function does nothing. * * The freeing of segments is not finalized until the writing of a log with * a super root block containing this sufile change is complete, and it can * be canceled with nilfs_sufile_cancel_freev() until then. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid segment number. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ static int nilfs_free_segments(struct the_nilfs *nilfs, __u64 *segnumv, size_t nsegs) { size_t ndone; int ret; if (!nsegs) return 0; ret = nilfs_sufile_freev(nilfs->ns_sufile, segnumv, nsegs, &ndone); if (unlikely(ret)) { nilfs_sufile_cancel_freev(nilfs->ns_sufile, segnumv, ndone, NULL); /* * If a segment usage of the segments to be freed is in a * hole block, nilfs_sufile_freev() will return -ENOENT. * In this case, -EINVAL should be returned to the caller * since there is something wrong with the given segment * number array. This error can only occur during GC, so * there is no need to worry about it propagating to other * callers (such as fsync). */ if (ret == -ENOENT) { nilfs_err(nilfs->ns_sb, "The segment usage entry %llu to be freed is invalid (in a hole)", (unsigned long long)segnumv[ndone]); ret = -EINVAL; } } return ret; } static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; struct list_head *head; struct nilfs_inode_info *ii; int err = 0; switch (nilfs_sc_cstage_get(sci)) { case NILFS_ST_INIT: /* Pre-processes */ sci->sc_stage.flags = 0; if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { sci->sc_nblk_inc = 0; sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; if (mode == SC_LSEG_DSYNC) { nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC); goto dsync_mode; } } sci->sc_stage.dirty_file_ptr = NULL; sci->sc_stage.gc_inode_ptr = NULL; if (mode == SC_FLUSH_DAT) { nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, head, i_dirty); list_for_each_entry_continue(ii, head, i_dirty) { err = nilfs_segctor_scan_file( sci, &ii->vfs_inode, &nilfs_sc_file_ops); if (unlikely(err)) { sci->sc_stage.gc_inode_ptr = list_entry( ii->i_dirty.prev, struct nilfs_inode_info, i_dirty); goto break_or_fail; } set_bit(NILFS_I_COLLECTED, &ii->i_state); } sci->sc_stage.gc_inode_ptr = NULL; } nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, i_dirty); list_for_each_entry_continue(ii, head, i_dirty) { clear_bit(NILFS_I_DIRTY, &ii->i_state); err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, &nilfs_sc_file_ops); if (unlikely(err)) { sci->sc_stage.dirty_file_ptr = list_entry(ii->i_dirty.prev, struct nilfs_inode_info, i_dirty); goto break_or_fail; } /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ /* XXX: required ? */ } sci->sc_stage.dirty_file_ptr = NULL; if (mode == SC_FLUSH_FILE) { nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; fallthrough; case NILFS_ST_IFILE: err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, &nilfs_sc_file_ops); if (unlikely(err)) break; nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno); if (unlikely(err)) break; fallthrough; case NILFS_ST_CPFILE: err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, &nilfs_sc_file_ops); if (unlikely(err)) break; nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_SUFILE: err = nilfs_free_segments(nilfs, sci->sc_freesegs, sci->sc_nfreesegs); if (unlikely(err)) break; sci->sc_stage.flags |= NILFS_CF_SUFREED; err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, &nilfs_sc_file_ops); if (unlikely(err)) break; nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, &nilfs_sc_dat_ops); if (unlikely(err)) break; if (mode == SC_FLUSH_DAT) { nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ err = nilfs_segctor_add_super_root(sci); if (unlikely(err)) break; } /* End of a logical segment */ sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DSYNC: dsync_mode: sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; ii = sci->sc_dsync_inode; if (!test_bit(NILFS_I_BUSY, &ii->i_state)) break; err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); if (unlikely(err)) break; sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DONE: return 0; default: BUG(); } break_or_fail: return err; } /** * nilfs_segctor_begin_construction - setup segment buffer to make a new log * @sci: nilfs_sc_info * @nilfs: nilfs object * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf, *prev; __u64 nextnum; int err, alloc = 0; segbuf = nilfs_segbuf_new(sci->sc_super); if (unlikely(!segbuf)) return -ENOMEM; if (list_empty(&sci->sc_write_logs)) { nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, nilfs); if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { nilfs_shift_to_next_segment(nilfs); nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); } segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; nextnum = nilfs->ns_nextnum; if (nilfs->ns_segnum == nilfs->ns_nextnum) /* Start from the head of a new full segment */ alloc++; } else { /* Continue logs */ prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); nilfs_segbuf_map_cont(segbuf, prev); segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; nextnum = prev->sb_nextnum; if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); segbuf->sb_sum.seg_seq++; alloc++; } } err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); if (err) goto failed; if (alloc) { err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); if (err) goto failed; } nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); BUG_ON(!list_empty(&sci->sc_segbufs)); list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; return 0; failed: nilfs_segbuf_free(segbuf); return err; } static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int nadd) { struct nilfs_segment_buffer *segbuf, *prev; struct inode *sufile = nilfs->ns_sufile; __u64 nextnextnum; LIST_HEAD(list); int err, ret, i; prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); /* * Since the segment specified with nextnum might be allocated during * the previous construction, the buffer including its segusage may * not be dirty. The following call ensures that the buffer is dirty * and will pin the buffer on memory until the sufile is written. */ err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); if (unlikely(err)) return err; for (i = 0; i < nadd; i++) { /* extend segment info */ err = -ENOMEM; segbuf = nilfs_segbuf_new(sci->sc_super); if (unlikely(!segbuf)) goto failed; /* map this buffer to region of segment on-disk */ nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; /* allocate the next next full segment */ err = nilfs_sufile_alloc(sufile, &nextnextnum); if (unlikely(err)) goto failed_segbuf; segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); list_add_tail(&segbuf->sb_list, &list); prev = segbuf; } list_splice_tail(&list, &sci->sc_segbufs); return 0; failed_segbuf: nilfs_segbuf_free(segbuf); failed: list_for_each_entry(segbuf, &list, sb_list) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } nilfs_destroy_logs(&list); return err; } static void nilfs_free_incomplete_logs(struct list_head *logs, struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf, *prev; struct inode *sufile = nilfs->ns_sufile; int ret; segbuf = NILFS_FIRST_SEGBUF(logs); if (nilfs->ns_nextnum != segbuf->sb_nextnum) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) /* * Case 1a: Partial segment appended into an existing * segment */ nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ set_nilfs_discontinued(nilfs); } prev = segbuf; list_for_each_entry_continue(segbuf, logs, sb_list) { if (prev->sb_nextnum != segbuf->sb_nextnum) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } if (atomic_read(&segbuf->sb_err) && segbuf->sb_segnum != nilfs->ns_nextnum) /* Case 2: extended segment (!= next) failed */ nilfs_sufile_set_error(sufile, segbuf->sb_segnum); prev = segbuf; } } static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; unsigned long live_blocks; int ret; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { live_blocks = segbuf->sb_sum.nblocks + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, live_blocks, sci->sc_seg_ctime); WARN_ON(ret); /* always succeed because the segusage is dirty */ } } static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; int ret; segbuf = NILFS_FIRST_SEGBUF(logs); ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, segbuf->sb_pseg_start - segbuf->sb_fseg_start, 0); WARN_ON(ret); /* always succeed because the segusage is dirty */ list_for_each_entry_continue(segbuf, logs, sb_list) { ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, 0, 0); WARN_ON(ret); /* always succeed */ } } static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *last, struct inode *sufile) { struct nilfs_segment_buffer *segbuf = last; int ret; list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); } nilfs_truncate_logs(&sci->sc_segbufs, last); } static int nilfs_segctor_collect(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int mode) { struct nilfs_cstage prev_stage = sci->sc_stage; int err, nadd = 1; /* Collection retry loop */ for (;;) { sci->sc_nblk_this_inc = 0; sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); err = nilfs_segctor_reset_segment_buffer(sci); if (unlikely(err)) goto failed; err = nilfs_segctor_collect_blocks(sci, mode); sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (!err) break; if (unlikely(err != -E2BIG)) goto failed; /* The current segment is filled up */ if (mode != SC_LSEG_SR || nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE) break; nilfs_clear_logs(&sci->sc_segbufs); if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } err = nilfs_segctor_extend_segments(sci, nilfs, nadd); if (unlikely(err)) return err; nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } nilfs_segctor_zeropad_segsum(sci); nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); return 0; failed: return err; } static void nilfs_list_replace_buffer(struct buffer_head *old_bh, struct buffer_head *new_bh) { BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); /* The caller must release old_bh */ } static int nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *segbuf, int mode) { struct inode *inode = NULL; sector_t blocknr; unsigned long nfinfo = segbuf->sb_sum.nfinfo; unsigned long nblocks = 0, ndatablk = 0; const struct nilfs_sc_operations *sc_op = NULL; struct nilfs_segsum_pointer ssp; struct nilfs_finfo *finfo = NULL; union nilfs_binfo binfo; struct buffer_head *bh, *bh_org; ino_t ino = 0; int err = 0; if (!nfinfo) goto out; blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); ssp.offset = sizeof(struct nilfs_segment_summary); list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { if (bh == segbuf->sb_super_root) break; if (!finfo) { finfo = nilfs_segctor_map_segsum_entry( sci, &ssp, sizeof(*finfo)); ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); inode = bh->b_folio->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; else if (ino == NILFS_DAT_INO) sc_op = &nilfs_sc_dat_ops; else /* file blocks */ sc_op = &nilfs_sc_file_ops; } bh_org = bh; get_bh(bh_org); err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, &binfo); if (bh != bh_org) nilfs_list_replace_buffer(bh_org, bh); brelse(bh_org); if (unlikely(err)) goto failed_bmap; if (ndatablk > 0) sc_op->write_data_binfo(sci, &ssp, &binfo); else sc_op->write_node_binfo(sci, &ssp, &binfo); blocknr++; if (--nblocks == 0) { finfo = NULL; if (--nfinfo == 0) break; } else if (ndatablk > 0) ndatablk--; } out: return 0; failed_bmap: return err; } static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) { struct nilfs_segment_buffer *segbuf; int err; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); if (unlikely(err)) return err; nilfs_segbuf_fill_in_segsum(segbuf); } return 0; } static void nilfs_begin_folio_io(struct folio *folio) { if (!folio || folio_test_writeback(folio)) /* * For split b-tree node pages, this function may be called * twice. We ignore the 2nd or later calls by this check. */ return; folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); folio_unlock(folio); } /** * nilfs_prepare_write_logs - prepare to write logs * @logs: logs to prepare for writing * @seed: checksum seed value * * nilfs_prepare_write_logs() adds checksums and prepares the block * buffers/folios for writing logs. In order to stabilize folios of * memory-mapped file blocks by putting them in writeback state before * calculating the checksums, first prepare to write payload blocks other * than segment summary and super root blocks in which the checksums will * be embedded. */ static void nilfs_prepare_write_logs(struct list_head *logs, u32 seed) { struct nilfs_segment_buffer *segbuf; struct folio *bd_folio = NULL, *fs_folio = NULL; struct buffer_head *bh; /* Prepare to write payload blocks */ list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { if (bh == segbuf->sb_super_root) break; set_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_begin_folio_io(fs_folio); fs_folio = bh->b_folio; } } } nilfs_begin_folio_io(fs_folio); nilfs_add_checksums_on_logs(logs, seed); /* Prepare to write segment summary blocks */ list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { mark_buffer_dirty(bh); if (bh->b_folio == bd_folio) continue; if (bd_folio) { folio_lock(bd_folio); folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); } bd_folio = bh->b_folio; } } /* Prepare to write super root block */ bh = NILFS_LAST_SEGBUF(logs)->sb_super_root; if (bh) { mark_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_lock(bd_folio); folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); bd_folio = bh->b_folio; } } if (bd_folio) { folio_lock(bd_folio); folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); } } static int nilfs_segctor_write(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { int ret; ret = nilfs_write_logs(&sci->sc_segbufs, nilfs); list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); return ret; } static void nilfs_end_folio_io(struct folio *folio, int err) { if (!folio) return; if (buffer_nilfs_node(folio_buffers(folio)) && !folio_test_writeback(folio)) { /* * For b-tree node pages, this function may be called twice * or more because they might be split in a segment. */ if (folio_test_dirty(folio)) { /* * For pages holding split b-tree node buffers, dirty * flag on the buffers may be cleared discretely. * In that case, the page is once redirtied for * remaining buffers, and it must be cancelled if * all the buffers get cleaned later. */ folio_lock(folio); if (nilfs_folio_buffers_clean(folio)) __nilfs_clear_folio_dirty(folio); folio_unlock(folio); } return; } if (err || !nilfs_folio_buffers_clean(folio)) filemap_dirty_folio(folio->mapping, folio); folio_end_writeback(folio); } static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; struct folio *bd_folio = NULL, *fs_folio = NULL; struct buffer_head *bh; if (list_empty(logs)) return; list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { if (bd_folio) folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } break; } clear_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, err); fs_folio = bh->b_folio; } } } if (bd_folio) folio_end_writeback(bd_folio); nilfs_end_folio_io(fs_folio, err); } static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int err) { LIST_HEAD(logs); int ret; list_splice_tail_init(&sci->sc_write_logs, &logs); ret = nilfs_wait_on_logs(&logs); nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); if (list_empty(&logs)) return; /* if the first segment buffer preparation failed */ nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); if (sci->sc_stage.flags & NILFS_CF_SUFREED) { ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(ret); /* do not happen */ } nilfs_destroy_logs(&logs); } static void nilfs_set_next_segment(struct the_nilfs *nilfs, struct nilfs_segment_buffer *segbuf) { nilfs->ns_segnum = segbuf->sb_segnum; nilfs->ns_nextnum = segbuf->sb_nextnum; nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + segbuf->sb_sum.nblocks; nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; nilfs->ns_ctime = segbuf->sb_sum.ctime; } static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; struct folio *bd_folio = NULL, *fs_folio = NULL; struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int update_sr = false; list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { struct buffer_head *bh; list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { if (bd_folio) folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } } /* * We assume that the buffers which belong to the same folio * continue over the buffer list. * Under this assumption, the last BHs of folios is * identifiable by the discontinuity of bh->b_folio * (folio != fs_folio). * * For B-tree node blocks, however, this assumption is not * guaranteed. The cleanup code of B-tree node folios needs * special care. */ list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { const unsigned long set_bits = BIT(BH_Uptodate); const unsigned long clear_bits = (BIT(BH_Dirty) | BIT(BH_Async_Write) | BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Redirected)); if (bh == segbuf->sb_super_root) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } update_sr = true; break; } set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, 0); fs_folio = bh->b_folio; } } if (!nilfs_segbuf_simplex(segbuf)) { if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); sci->sc_lseg_stime = jiffies; } if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); } } /* * Since folios may continue over multiple segment buffers, * end of the last folio must be checked outside of the loop. */ if (bd_folio) folio_end_writeback(bd_folio); nilfs_end_folio_io(fs_folio, 0); nilfs_drop_collected_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_drop_collected_inodes(&sci->sc_gc_inodes); else nilfs->ns_nongc_ctime = sci->sc_seg_ctime; sci->sc_nblk_inc += sci->sc_nblk_this_inc; segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); nilfs_set_next_segment(nilfs, segbuf); if (update_sr) { nilfs->ns_flushed_device = 0; nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno++); clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); nilfs_segctor_clear_metadata_dirty(sci); } else clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); } static int nilfs_segctor_wait(struct nilfs_sc_info *sci) { int ret; ret = nilfs_wait_on_logs(&sci->sc_write_logs); if (!ret) { nilfs_segctor_complete_write(sci); nilfs_destroy_logs(&sci->sc_write_logs); } return ret; } static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; struct inode *ifile = sci->sc_root->ifile; spin_lock(&nilfs->ns_inode_lock); retry: list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) { if (!ii->i_bh) { struct buffer_head *ibh; int err; spin_unlock(&nilfs->ns_inode_lock); err = nilfs_ifile_get_inode_block( ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { nilfs_warn(sci->sc_super, "log writer: error %d getting inode block (ino=%lu)", err, ii->vfs_inode.i_ino); return err; } spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; else brelse(ibh); goto retry; } // Always redirty the buffer to avoid race condition mark_buffer_dirty(ii->i_bh); nilfs_mdt_mark_dirty(ifile); clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); } spin_unlock(&nilfs->ns_inode_lock); return 0; } static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || test_bit(NILFS_I_DIRTY, &ii->i_state)) continue; clear_bit(NILFS_I_BUSY, &ii->i_state); brelse(ii->i_bh); ii->i_bh = NULL; list_del_init(&ii->i_dirty); if (!ii->vfs_inode.i_nlink || during_mount) { /* * Defer calling iput() to avoid deadlocks if * i_nlink == 0 or mount is not yet finished. */ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); defer_iput = true; } else { spin_unlock(&nilfs->ns_inode_lock); iput(&ii->vfs_inode); spin_lock(&nilfs->ns_inode_lock); } } spin_unlock(&nilfs->ns_inode_lock); if (defer_iput) schedule_work(&sci->sc_iput_work); } /* * Main procedure of segment constructor */ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; if (sb_rdonly(sci->sc_super)) return -EROFS; nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; err = nilfs_segctor_collect_dirty_files(sci, nilfs); if (unlikely(err)) goto out; if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) set_bit(NILFS_SC_DIRTY, &sci->sc_flags); if (nilfs_segctor_clean(sci)) goto out; do { sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; err = nilfs_segctor_begin_construction(sci, nilfs); if (unlikely(err)) goto failed; /* Update time stamp */ sci->sc_seg_ctime = ktime_get_real_seconds(); err = nilfs_segctor_collect(sci, nilfs, mode); if (unlikely(err)) goto failed; /* Avoid empty segment */ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE && nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; } err = nilfs_segctor_assign(sci, mode); if (unlikely(err)) goto failed; if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) nilfs_segctor_fill_in_file_bmap(sci); if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { err = nilfs_cpfile_finalize_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root, sci->sc_nblk_inc + sci->sc_nblk_this_inc, sci->sc_seg_ctime, !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)); if (unlikely(err)) goto failed_to_write; nilfs_segctor_fill_in_super_root(sci, nilfs); } nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); /* Write partial segments */ nilfs_prepare_write_logs(&sci->sc_segbufs, nilfs->ns_crc_seed); err = nilfs_segctor_write(sci, nilfs); if (unlikely(err)) goto failed_to_write; if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE || nilfs->ns_blocksize_bits != PAGE_SHIFT) { /* * At this point, we avoid double buffering * for blocksize < pagesize because page dirty * flag is turned off during write and dirty * buffers are not properly collected for * pages crossing over segments. */ err = nilfs_segctor_wait(sci); if (err) goto failed_to_write; } } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE); out: nilfs_segctor_drop_written_files(sci, nilfs); return err; failed_to_write: failed: if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE) nilfs_redirty_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); nilfs_segctor_abort_construction(sci, nilfs, err); goto out; } /** * nilfs_segctor_start_timer - set timer of background write * @sci: nilfs_sc_info * * If the timer has already been set, it ignores the new request. * This function MUST be called within a section locking the segment * semaphore. */ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) { spin_lock(&sci->sc_state_lock); if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { if (sci->sc_task) { sci->sc_timer.expires = jiffies + sci->sc_interval; add_timer(&sci->sc_timer); } sci->sc_state |= NILFS_SEGCTOR_COMMIT; } spin_unlock(&sci->sc_state_lock); } static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) { spin_lock(&sci->sc_state_lock); if (!(sci->sc_flush_request & BIT(bn))) { unsigned long prev_req = sci->sc_flush_request; sci->sc_flush_request |= BIT(bn); if (!prev_req) wake_up(&sci->sc_wait_daemon); } spin_unlock(&sci->sc_state_lock); } struct nilfs_segctor_wait_request { wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; }; static int nilfs_segctor_sync(struct nilfs_sc_info *sci) { struct nilfs_segctor_wait_request wait_req; int err = 0; init_wait(&wait_req.wq); wait_req.err = 0; atomic_set(&wait_req.done, 0); init_waitqueue_entry(&wait_req.wq, current); /* * To prevent a race issue where completion notifications from the * log writer thread are missed, increment the request sequence count * "sc_seq_request" and insert a wait queue entry using the current * sequence number into the "sc_wait_request" queue at the same time * within the lock section of "sc_state_lock". */ spin_lock(&sci->sc_state_lock); wait_req.seq = ++sci->sc_seq_request; add_wait_queue(&sci->sc_wait_request, &wait_req.wq); spin_unlock(&sci->sc_state_lock); wake_up(&sci->sc_wait_daemon); for (;;) { set_current_state(TASK_INTERRUPTIBLE); /* * Synchronize only while the log writer thread is alive. * Leave flushing out after the log writer thread exits to * the cleanup work in nilfs_segctor_destroy(). */ if (!sci->sc_task) break; if (atomic_read(&wait_req.done)) { err = wait_req.err; break; } if (!signal_pending(current)) { schedule(); continue; } err = -ERESTARTSYS; break; } finish_wait(&sci->sc_wait_request, &wait_req.wq); return err; } static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err, bool force) { struct nilfs_segctor_wait_request *wrq, *n; unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && (force || nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq))) { wrq->err = err; atomic_set(&wrq->done, 1); } if (atomic_read(&wrq->done)) { wrq->wq.func(&wrq->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, NULL); } } spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); } /** * nilfs_construct_segment - construct a logical segment * @sb: super block * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. * * %-EROFS - Read only filesystem. */ int nilfs_construct_segment(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; /* A call inside transactions causes a deadlock. */ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); return nilfs_segctor_sync(sci); } /** * nilfs_construct_dsync_segment - construct a data-only logical segment * @sb: super block * @inode: inode whose data blocks should be written out * @start: start byte offset * @end: end byte offset (inclusive) * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. * * %-EROFS - Read only filesystem. */ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, loff_t start, loff_t end) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_inode_info *ii; struct nilfs_transaction_info ti; int err = 0; if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 0); ii = NILFS_I(inode); if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) || nilfs_test_opt(nilfs, STRICT_ORDER) || test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || nilfs_discontinued(nilfs)) { nilfs_transaction_unlock(sb); err = nilfs_segctor_sync(sci); return err; } spin_lock(&nilfs->ns_inode_lock); if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && !test_bit(NILFS_I_BUSY, &ii->i_state)) { spin_unlock(&nilfs->ns_inode_lock); nilfs_transaction_unlock(sb); return 0; } spin_unlock(&nilfs->ns_inode_lock); sci->sc_dsync_inode = ii; sci->sc_dsync_start = start; sci->sc_dsync_end = end; err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); if (!err) nilfs->ns_flushed_device = 0; nilfs_transaction_unlock(sb); return err; } #define FLUSH_FILE_BIT (0x1) /* data file only */ #define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */ /** * nilfs_segctor_accept - record accepted sequence count of log-write requests * @sci: segment constructor object */ static void nilfs_segctor_accept(struct nilfs_sc_info *sci) { bool thread_is_alive; spin_lock(&sci->sc_state_lock); sci->sc_seq_accepted = sci->sc_seq_request; thread_is_alive = (bool)sci->sc_task; spin_unlock(&sci->sc_state_lock); /* * This function does not race with the log writer thread's * termination. Therefore, deleting sc_timer, which should not be * done after the log writer thread exits, can be done safely outside * the area protected by sc_state_lock. */ if (thread_is_alive) timer_delete_sync(&sci->sc_timer); } /** * nilfs_segctor_notify - notify the result of request to caller threads * @sci: segment constructor object * @mode: mode of log forming * @err: error code to be notified */ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) { /* Clear requests (even when the construction failed) */ spin_lock(&sci->sc_state_lock); if (mode == SC_LSEG_SR) { sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; sci->sc_seq_done = sci->sc_seq_accepted; nilfs_segctor_wakeup(sci, err, false); sci->sc_flush_request = 0; } else { if (mode == SC_FLUSH_FILE) sci->sc_flush_request &= ~FLUSH_FILE_BIT; else if (mode == SC_FLUSH_DAT) sci->sc_flush_request &= ~FLUSH_DAT_BIT; /* re-enable timer if checkpoint creation was not done */ if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && sci->sc_task && time_before(jiffies, sci->sc_timer.expires)) add_timer(&sci->sc_timer); } spin_unlock(&sci->sc_state_lock); } /** * nilfs_segctor_construct - form logs and write them to disk * @sci: segment constructor object * @mode: mode of log forming * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; struct nilfs_super_block **sbp; int err = 0; nilfs_segctor_accept(sci); if (nilfs_discontinued(nilfs)) mode = SC_LSEG_SR; if (!nilfs_segctor_confirm(sci)) err = nilfs_segctor_do_construct(sci, mode); if (likely(!err)) { if (mode != SC_FLUSH_DAT) atomic_set(&nilfs->ns_ndirtyblks, 0); if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); err = -EIO; sbp = nilfs_prepare_super(sci->sc_super, nilfs_sb_will_flip(nilfs)); if (likely(sbp)) { nilfs_set_log_cursor(sbp[0], nilfs); err = nilfs_commit_super(sci->sc_super, NILFS_SB_COMMIT); } up_write(&nilfs->ns_sem); } } nilfs_segctor_notify(sci, mode, err); return err; } static void nilfs_construction_timeout(struct timer_list *t) { struct nilfs_sc_info *sci = timer_container_of(sci, t, sc_timer); wake_up_process(sci->sc_task); } static void nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) { struct nilfs_inode_info *ii, *n; list_for_each_entry_safe(ii, n, head, i_dirty) { if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) continue; list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, void **kbufs) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info ti; int err; if (unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 1); err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); if (unlikely(err)) goto out_unlock; err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); if (unlikely(err)) { nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); goto out_unlock; } sci->sc_freesegs = kbufs[4]; sci->sc_nfreesegs = argv[4].v_nmembs; list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); for (;;) { err = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); if (likely(!err)) break; nilfs_warn(sb, "error %d cleaning segments", err); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(sci->sc_interval); } if (nilfs_test_opt(nilfs, DISCARD)) { int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, sci->sc_nfreesegs); if (ret) { nilfs_warn(sb, "error %d on discard request, turning discards off for the device", ret); nilfs_clear_opt(nilfs, DISCARD); } } out_unlock: sci->sc_freesegs = NULL; sci->sc_nfreesegs = 0; nilfs_mdt_clear_shadow_map(nilfs->ns_dat); nilfs_transaction_unlock(sb); return err; } static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) { struct nilfs_transaction_info ti; nilfs_transaction_lock(sci->sc_super, &ti, 0); nilfs_segctor_construct(sci, mode); /* * Unclosed segment should be retried. We do this using sc_timer. * Timeout of sc_timer will invoke complete construction which leads * to close the current logical segment. */ if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) nilfs_segctor_start_timer(sci); nilfs_transaction_unlock(sci->sc_super); } static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) { int mode = 0; spin_lock(&sci->sc_state_lock); mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? SC_FLUSH_DAT : SC_FLUSH_FILE; spin_unlock(&sci->sc_state_lock); if (mode) { nilfs_segctor_do_construct(sci, mode); spin_lock(&sci->sc_state_lock); sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; spin_unlock(&sci->sc_state_lock); } clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); } static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) { if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) return SC_FLUSH_FILE; else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) return SC_FLUSH_DAT; } return SC_LSEG_SR; } /** * nilfs_log_write_required - determine whether log writing is required * @sci: nilfs_sc_info struct * @modep: location for storing log writing mode * * Return: true if log writing is required, false otherwise. If log writing * is required, the mode is stored in the location pointed to by @modep. */ static bool nilfs_log_write_required(struct nilfs_sc_info *sci, int *modep) { bool timedout, ret = true; spin_lock(&sci->sc_state_lock); timedout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer.expires)); if (timedout || sci->sc_seq_request != sci->sc_seq_done) *modep = SC_LSEG_SR; else if (sci->sc_flush_request) *modep = nilfs_segctor_flush_mode(sci); else ret = false; spin_unlock(&sci->sc_state_lock); return ret; } /** * nilfs_segctor_thread - main loop of the log writer thread * @arg: pointer to a struct nilfs_sc_info. * * nilfs_segctor_thread() is the main loop function of the log writer kernel * thread, which determines whether log writing is necessary, and if so, * performs the log write in the background, or waits if not. It is also * used to decide the background writeback of the superblock. * * Return: Always 0. */ static int nilfs_segctor_thread(void *arg) { struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; struct the_nilfs *nilfs = sci->sc_super->s_fs_info; nilfs_info(sci->sc_super, "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds", sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); set_freezable(); while (!kthread_should_stop()) { DEFINE_WAIT(wait); bool should_write; int mode; if (freezing(current)) { try_to_freeze(); continue; } prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); should_write = nilfs_log_write_required(sci, &mode); if (!should_write) schedule(); finish_wait(&sci->sc_wait_daemon, &wait); if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); if (should_write) nilfs_segctor_thread_construct(sci, mode); } /* end sync. */ spin_lock(&sci->sc_state_lock); sci->sc_task = NULL; timer_shutdown_sync(&sci->sc_timer); spin_unlock(&sci->sc_state_lock); return 0; } /* * Setup & clean-up functions */ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, struct nilfs_root *root) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci; sci = kzalloc(sizeof(*sci), GFP_KERNEL); if (!sci) return NULL; sci->sc_super = sb; nilfs_get_root(root); sci->sc_root = root; init_waitqueue_head(&sci->sc_wait_request); init_waitqueue_head(&sci->sc_wait_daemon); spin_lock_init(&sci->sc_state_lock); INIT_LIST_HEAD(&sci->sc_dirty_files); INIT_LIST_HEAD(&sci->sc_segbufs); INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; if (nilfs->ns_interval) sci->sc_interval = HZ * nilfs->ns_interval; if (nilfs->ns_watermark) sci->sc_watermark = nilfs->ns_watermark; return sci; } static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) { int ret, retrycount = NILFS_SC_CLEANUP_RETRY; /* * The segctord thread was stopped and its timer was removed. * But some tasks remain. */ do { struct nilfs_transaction_info ti; nilfs_transaction_lock(sci->sc_super, &ti, 0); ret = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_transaction_unlock(sci->sc_super); flush_work(&sci->sc_iput_work); } while (ret && ret != -EROFS && retrycount-- > 0); } /** * nilfs_segctor_destroy - destroy the segment constructor. * @sci: nilfs_sc_info * * nilfs_segctor_destroy() kills the segctord thread and frees * the nilfs_sc_info struct. * Caller must hold the segment semaphore. */ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int flag; up_write(&nilfs->ns_segctor_sem); if (sci->sc_task) { wake_up(&sci->sc_wait_daemon); kthread_stop(sci->sc_task); } spin_lock(&sci->sc_state_lock); flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request || sci->sc_seq_request != sci->sc_seq_done); spin_unlock(&sci->sc_state_lock); /* * Forcibly wake up tasks waiting in nilfs_segctor_sync(), which can * be called from delayed iput() via nilfs_evict_inode() and can race * with the above log writer thread termination. */ nilfs_segctor_wakeup(sci, 0, true); if (flush_work(&sci->sc_iput_work)) flag = true; if (flag || !nilfs_segctor_confirm(sci)) nilfs_segctor_write_out(sci); if (!list_empty(&sci->sc_dirty_files)) { nilfs_warn(sci->sc_super, "disposed unprocessed dirty file(s) when stopping log writer"); nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); } if (!list_empty(&sci->sc_iput_queue)) { nilfs_warn(sci->sc_super, "disposed unprocessed inode(s) in iput queue when stopping log writer"); nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); } WARN_ON(!list_empty(&sci->sc_segbufs)); WARN_ON(!list_empty(&sci->sc_write_logs)); nilfs_put_root(sci->sc_root); down_write(&nilfs->ns_segctor_sem); kfree(sci); } /** * nilfs_attach_log_writer - attach log writer * @sb: super block instance * @root: root object of the current filesystem tree * * This allocates a log writer object, initializes it, and starts the * log writer. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINTR - Log writer thread creation failed due to interruption. * * %-ENOMEM - Insufficient memory available. */ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci; struct task_struct *t; int err; if (nilfs->ns_writer) { /* * This happens if the filesystem is made read-only by * __nilfs_error or nilfs_remount and then remounted * read/write. In these cases, reuse the existing * writer. */ return 0; } sci = nilfs_segctor_new(sb, root); if (unlikely(!sci)) return -ENOMEM; nilfs->ns_writer = sci; t = kthread_create(nilfs_segctor_thread, sci, "segctord"); if (IS_ERR(t)) { err = PTR_ERR(t); nilfs_err(sb, "error %d creating segctord thread", err); nilfs_detach_log_writer(sb); return err; } sci->sc_task = t; timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); wake_up_process(sci->sc_task); return 0; } /** * nilfs_detach_log_writer - destroy log writer * @sb: super block instance * * This kills log writer daemon, frees the log writer object, and * destroys list of dirty files. */ void nilfs_detach_log_writer(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; LIST_HEAD(garbage_list); down_write(&nilfs->ns_segctor_sem); if (nilfs->ns_writer) { nilfs_segctor_destroy(nilfs->ns_writer); nilfs->ns_writer = NULL; } set_nilfs_purging(nilfs); /* Force to free the list of dirty files */ spin_lock(&nilfs->ns_inode_lock); if (!list_empty(&nilfs->ns_dirty_files)) { list_splice_init(&nilfs->ns_dirty_files, &garbage_list); nilfs_warn(sb, "disposed unprocessed dirty file(s) when detaching log writer"); } spin_unlock(&nilfs->ns_inode_lock); up_write(&nilfs->ns_segctor_sem); nilfs_dispose_list(nilfs, &garbage_list, 1); clear_nilfs_purging(nilfs); } |
| 97 32 45 75 95 88 9 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_DISK_GROUPS_H #define _BCACHEFS_DISK_GROUPS_H #include "disk_groups_types.h" extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) { return groups ? (vstruct_end(&groups->field) - (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) : 0; } struct target { enum { TARGET_NULL, TARGET_DEV, TARGET_GROUP, } type; union { unsigned dev; unsigned group; }; }; #define TARGET_DEV_START 1 #define TARGET_GROUP_START (256 + TARGET_DEV_START) static inline u16 dev_to_target(unsigned dev) { return TARGET_DEV_START + dev; } static inline u16 group_to_target(unsigned group) { return TARGET_GROUP_START + group; } static inline struct target target_decode(unsigned target) { if (target >= TARGET_GROUP_START) return (struct target) { .type = TARGET_GROUP, .group = target - TARGET_GROUP_START }; if (target >= TARGET_DEV_START) return (struct target) { .type = TARGET_DEV, .group = target - TARGET_DEV_START }; return (struct target) { .type = TARGET_NULL }; } const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, enum bch_data_type data_type, u16 target) { struct bch_devs_mask devs = c->rw_devs[data_type]; const struct bch_devs_mask *t = bch2_target_to_mask(c, target); if (t) bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); return devs; } static inline bool bch2_target_accepts_data(struct bch_fs *c, enum bch_data_type data_type, u16 target) { struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); } bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); /* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); #define bch2_opt_target (struct bch_opt_fn) { \ .parse = bch2_opt_target_parse, \ .to_text = bch2_opt_target_to_text, \ } int bch2_sb_disk_groups_to_cpu(struct bch_fs *); int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); const char *bch2_sb_validate_disk_groups(struct bch_sb *, struct bch_sb_field *); void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_DISK_GROUPS_H */ |
| 1 1 1 1 1 2 1 1 1 2 2 1 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring.h> #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "refs.h" #include "cancel.h" #include "timeout.h" struct io_timeout { struct file *file; u32 off; u32 target_seq; u32 repeats; struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; /* for linked completions */ struct io_kiocb *prev; }; struct io_timeout_rem { struct file *file; u64 addr; /* timeout update */ struct timespec64 ts; u32 flags; bool ltimeout; }; static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_kiocb *link); static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT; } static inline void io_put_req(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { io_queue_next(req); io_free_req(req); } } static inline bool io_timeout_finish(struct io_timeout *timeout, struct io_timeout_data *data) { if (!(data->flags & IORING_TIMEOUT_MULTISHOT)) return true; if (!timeout->off || (timeout->repeats && --timeout->repeats)) return false; return true; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return; } } io_req_task_complete(req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) { if (list_empty(list)) return false; while (!list_empty(list)) { struct io_timeout *timeout; struct io_kiocb *req; timeout = list_first_entry(list, struct io_timeout, list); list_del_init(&timeout->list); req = cmd_to_io_kiocb(timeout); if (err) req_set_fail(req); io_req_queue_tw_complete(req, err); } return true; } static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_move_tail(&timeout->list, list); } } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { struct io_timeout *timeout, *tmp; LIST_HEAD(list); u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); u32 events_needed, events_got; if (io_is_timeout_noseq(req)) break; /* * Since seq can easily wrap around over time, subtract * the last seq at which timeouts were flushed before comparing. * Assuming not more than 2^31-1 events have happened since, * these subtractions won't have wrapped, so we can check if * target is in [last_seq, current_seq] by comparing the two. */ events_needed = timeout->target_seq - ctx->cq_last_tm_flush; events_got = seq - ctx->cq_last_tm_flush; if (events_got < events_needed) break; io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; raw_spin_unlock_irq(&ctx->timeout_lock); io_flush_killed_timeouts(&list, 0); } static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) { io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; if (link->flags & REQ_F_FAIL) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); io_req_task_complete(link, tw); link = nxt; } } static void io_fail_links(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *link = req->link; bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; if (!link) return; while (link) { if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; trace_io_uring_fail_link(req, link); link = link->link; } link = req->link; link->io_task_work.func = io_req_tw_fail_links; io_req_task_work_add(link); req->link = NULL; } static inline void io_remove_next_linked(struct io_kiocb *req) { struct io_kiocb *nxt = req->link; req->link = nxt->link; nxt->link = NULL; } void io_disarm_next(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *link = NULL; if (req->flags & REQ_F_ARM_LTIMEOUT) { link = req->link; req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); io_req_queue_tw_complete(link, -ECANCELED); } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) link = __io_disarm_linked_timeout(req, req->link); raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) io_fail_links(req); } static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = link->async_data; struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout); io_remove_next_linked(req); timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&timeout->list); return link; } return NULL; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *req = data->req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); io_req_set_res(req, -ETIME, 0); req->io_task_work.func = io_timeout_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { struct io_timeout *timeout; struct io_timeout_data *io; struct io_kiocb *req = NULL; list_for_each_entry(timeout, &ctx->timeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); if (io_cancel_req_match(tmp, cd)) { req = tmp; break; } } if (!req) return ERR_PTR(-ENOENT); io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return ERR_PTR(-EALREADY); timeout = io_kiocb_to_cmd(req, struct io_timeout); list_del_init(&timeout->list); return req; } int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct io_kiocb *req; raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); io_req_task_queue_fail(req, -ECANCELED); return 0; } static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret; if (prev) { if (!io_should_terminate_tw()) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, }; ret = io_try_cancel(req->tctx, &cd, 0); } else { ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); io_req_task_complete(req, tw); } } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *prev, *req = data->req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; /* * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ if (prev) { io_remove_next_linked(prev); if (!req_ref_inc_not_zero(prev)) prev = NULL; } list_del(&timeout->list); timeout->prev = prev; raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); return HRTIMER_NORESTART; } static clockid_t io_timeout_get_clock(struct io_timeout_data *data) { switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { case IORING_TIMEOUT_BOOTTIME: return CLOCK_BOOTTIME; case IORING_TIMEOUT_REALTIME: return CLOCK_REALTIME; default: /* can't happen, vetted at prep time */ WARN_ON_ONCE(1); fallthrough; case 0: return CLOCK_MONOTONIC; } } static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; struct io_timeout *timeout; struct io_kiocb *req = NULL; list_for_each_entry(timeout, &ctx->ltimeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); if (user_data == tmp->cqe.user_data) { req = tmp; break; } } if (!req) return -ENOENT; io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; if (IS_ERR(req)) return PTR_ERR(req); timeout->off = 0; /* noseq */ data = req->async_data; data->ts = *ts; list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) return -EINVAL; } else if (tr->flags) { /* timeout removal doesn't support flags */ return -EINVAL; } return 0; } static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) { return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; } /* * Remove or update an existing timeout command */ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); struct io_ring_ctx *ctx = req->ctx; int ret; if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; spin_lock(&ctx->completion_lock); ret = io_timeout_cancel(ctx, &cd); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } static int __io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | IORING_TIMEOUT_ETIME_SUCCESS | IORING_TIMEOUT_MULTISHOT)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; /* multishot requests only make sense with rel values */ if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS))) return -EINVAL; INIT_LIST_HEAD(&timeout->list); timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr */ timeout->repeats = 0; if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0) timeout->repeats = off; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; data = io_uring_alloc_async_data(NULL, req); if (!data) return -ENOMEM; data->req = req; data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; data->mode = io_translate_timeout_mode(flags); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; if (!link->head) return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data), data->mode); } else { hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode); } return 0; } int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_timeout_prep(req, sqe, false); } int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_timeout_prep(req, sqe, true); } int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data = req->async_data; struct list_head *entry; u32 tail, off = timeout->off; raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } tail = data_race(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. * This is safe because ->completion_lock is held, and submissions * and completions are never mixed in the same ->completion_lock section. */ ctx->cq_last_tm_flush = tail; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nextt->target_seq - tail) break; } add: list_add(&timeout->list, entry); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } void io_queue_linked_timeout(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer */ if (timeout->head) { struct io_timeout_data *data = req->async_data; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; } return false; } /* Returns true if we found and killed one or more timeouts */ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct io_timeout *timeout, *tmp; LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). Take it before * timeout_lockfirst to keep locking ordering. */ spin_lock(&ctx->completion_lock); raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); if (io_match_task(req, tctx, cancel_all)) io_kill_timeout(req, &list); } raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return io_flush_killed_timeouts(&list, -ECANCELED); } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Single-block cipher operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <linux/kernel.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); int ret; u8 *buffer, *alignbuffer; unsigned long absize; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) return setkey_unaligned(tfm, key, keylen); return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, "CRYPTO_INTERNAL"); static inline void cipher_crypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src, bool enc) { unsigned long alignmask = crypto_cipher_alignmask(tfm); struct cipher_alg *cia = crypto_cipher_alg(tfm); void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = enc ? cia->cia_encrypt : cia->cia_decrypt; if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { unsigned int bs = crypto_cipher_blocksize(tfm); u8 buffer[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(tmp, src, bs); fn(crypto_cipher_tfm(tfm), tmp, tmp); memcpy(dst, tmp, bs); } else { fn(crypto_cipher_tfm(tfm), dst, src); } } void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, true); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, "CRYPTO_INTERNAL"); void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, false); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, "CRYPTO_INTERNAL"); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher) { struct crypto_tfm *tfm = crypto_cipher_tfm(cipher); struct crypto_alg *alg = tfm->__crt_alg; struct crypto_cipher *ncipher; struct crypto_tfm *ntfm; if (alg->cra_init) return ERR_PTR(-ENOSYS); if (unlikely(!crypto_mod_get(alg))) return ERR_PTR(-ESTALE); ntfm = __crypto_alloc_tfmgfp(alg, CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK, GFP_ATOMIC); if (IS_ERR(ntfm)) { crypto_mod_put(alg); return ERR_CAST(ntfm); } ntfm->crt_flags = tfm->crt_flags; ncipher = __crypto_cipher_cast(ntfm); return ncipher; } EXPORT_SYMBOL_GPL(crypto_clone_cipher); |
| 36 2 2 14 18 32 17 42 1 1 18 37 62 3 58 37 33 6 6 6 116 95 35 121 121 121 121 1 1 1 117 111 6 114 3 120 122 94 36 8 28 121 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 | // SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/fs_struct.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/compat.h> #include "internal.h" #include "mount.h" static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, void __user *mnt_id, bool unique_mntid, int fh_flags) { long retval; struct file_handle f_handle; int handle_dwords, handle_bytes; struct file_handle *handle = NULL; /* * We need to make sure whether the file system support decoding of * the file handle if decodeable file handle was requested. */ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; /* * A request to encode a connectable handle for a disconnected dentry * is unexpected since AT_EMPTY_PATH is not allowed. */ if (fh_flags & EXPORT_FH_CONNECTABLE && WARN_ON(path->dentry->d_flags & DCACHE_DISCONNECTED)) return -EINVAL; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) return -EFAULT; if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) return -ENOMEM; /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; /* Encode a possibly decodeable/connectable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, &handle_dwords, fh_flags); handle->handle_type = retval; /* convert handle size to bytes */ handle_bytes = handle_dwords * sizeof(u32); handle->handle_bytes = handle_bytes; if ((handle->handle_bytes > f_handle.handle_bytes) || (retval == FILEID_INVALID) || (retval < 0)) { /* As per old exportfs_encode_fh documentation * we could return ENOSPC to indicate overflow * But file system returned 255 always. So handle * both the values */ if (retval == FILEID_INVALID || retval == -ENOSPC) retval = -EOVERFLOW; /* * set the handle size to zero so we copy only * non variable part of the file_handle */ handle_bytes = 0; } else { /* * When asked to encode a connectable file handle, encode this * property in the file handle itself, so that we later know * how to decode it. * For sanity, also encode in the file handle if the encoded * object is a directory and verify this during decode, because * decoding directory file handles is quite different than * decoding connectable non-directory file handles. */ if (fh_flags & EXPORT_FH_CONNECTABLE) { handle->handle_type |= FILEID_IS_CONNECTABLE; if (d_is_dir(path->dentry)) fh_flags |= FILEID_IS_DIR; } retval = 0; } /* copy the mount id */ if (unique_mntid) { if (put_user(real_mount(path->mnt)->mnt_id_unique, (u64 __user *) mnt_id)) retval = -EFAULT; } else { if (put_user(real_mount(path->mnt)->mnt_id, (int __user *) mnt_id)) retval = -EFAULT; } /* copy the handle */ if (retval != -EFAULT && copy_to_user(ufh, handle, struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; } /** * sys_name_to_handle_at: convert name to handle * @dfd: directory relative to which name is interpreted if not absolute * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int) * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * * @handle->handle_size indicate the space available to store the * variable part of the file handle in bytes. If there is not * enough space, the field is updated to return the minimum * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, void __user *, mnt_id, int, flag) { struct path path; int lookup_flags; int fh_flags = 0; int err; if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | AT_HANDLE_MNT_ID_UNIQUE | AT_HANDLE_CONNECTABLE)) return -EINVAL; /* * AT_HANDLE_FID means there is no intention to decode file handle * AT_HANDLE_CONNECTABLE means there is an intention to decode a * connected fd (with known path), so these flags are conflicting. * AT_EMPTY_PATH could be used along with a dfd that refers to a * disconnected non-directory, which cannot be used to encode a * connectable file handle, because its parent is unknown. */ if (flag & AT_HANDLE_CONNECTABLE && flag & (AT_HANDLE_FID | AT_EMPTY_PATH)) return -EINVAL; else if (flag & AT_HANDLE_FID) fh_flags |= EXPORT_FH_FID; else if (flag & AT_HANDLE_CONNECTABLE) fh_flags |= EXPORT_FH_CONNECTABLE; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { err = do_sys_name_to_handle(&path, handle, mnt_id, flag & AT_HANDLE_MNT_ID_UNIQUE, fh_flags); path_put(&path); } return err; } static int get_path_from_fd(int fd, struct path *root) { if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); *root = fs->pwd; path_get(root); spin_unlock(&fs->lock); } else { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); } return 0; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { struct handle_to_path_ctx *ctx = context; struct user_namespace *user_ns = current_user_ns(); struct dentry *d, *root = ctx->root.dentry; struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt); int retval = 0; if (!root) return 1; /* Old permission model with global CAP_DAC_READ_SEARCH. */ if (!ctx->flags) return 1; /* * It's racy as we're not taking rename_lock but we're able to ignore * permissions and we just need an approximation whether we were able * to follow a path to the file. * * It's also potentially expensive on some filesystems especially if * there is a deep path. */ d = dget(dentry); while (d != root && !IS_ROOT(d)) { struct dentry *parent = dget_parent(d); /* * We know that we have the ability to override DAC permissions * as we've verified this earlier via CAP_DAC_READ_SEARCH. But * we also need to make sure that there aren't any unmapped * inodes in the path that would prevent us from reaching the * file. */ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(parent))) { dput(d); dput(parent); return retval; } dput(d); d = parent; } if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root) retval = 1; /* * exportfs_decode_fh_raw() does not call acceptable() callback with * a disconnected directory dentry, so we should have reached either * mount fd directory or sb root. */ if (ctx->fh_flags & EXPORT_FH_DIR_ONLY) WARN_ON_ONCE(d != root && d != root->d_sb->s_root); dput(d); return retval; } static int do_handle_to_path(struct file_handle *handle, struct path *path, struct handle_to_path_ctx *ctx) { int handle_dwords; struct vfsmount *mnt = ctx->root.mnt; struct dentry *dentry; /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, handle_dwords, handle->handle_type, ctx->fh_flags, vfs_dentry_acceptable, ctx); if (IS_ERR_OR_NULL(dentry)) { if (dentry == ERR_PTR(-ENOMEM)) return -ENOMEM; return -ESTALE; } path->dentry = dentry; path->mnt = mntget(mnt); return 0; } static inline int may_decode_fh(struct handle_to_path_ctx *ctx, unsigned int o_flags) { struct path *root = &ctx->root; if (capable(CAP_DAC_READ_SEARCH)) return 0; /* * Allow relaxed permissions of file handles if the caller has * the ability to mount the filesystem or create a bind-mount of * the provided @mountdirfd. * * In both cases the caller may be able to get an unobstructed * way to the encoded file handle. If the caller is only able to * create a bind-mount we need to verify that there are no * locked mounts on top of it that could prevent us from getting * to the encoded file. * * In principle, locked mounts can prevent the caller from * mounting the filesystem but that only applies to procfs and * sysfs neither of which support decoding file handles. * * Restrict to O_DIRECTORY to provide a deterministic API that * avoids a confusing api in the face of disconnected non-dir * dentries. * * There's only one dentry for each directory inode (VFS rule)... */ if (!(o_flags & O_DIRECTORY)) return -EPERM; if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; else if (is_mounted(root->mnt) && ns_capable(real_mount(root->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN) && !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else return -EPERM; /* Are we able to override DAC permissions? */ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) return -EPERM; ctx->fh_flags = EXPORT_FH_DIR_ONLY; return 0; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct path *path, unsigned int o_flags) { int retval = 0; struct file_handle f_handle; struct file_handle *handle = NULL; struct handle_to_path_ctx ctx = {}; const struct export_operations *eops; retval = get_path_from_fd(mountdirfd, &ctx.root); if (retval) goto out_err; eops = ctx.root.mnt->mnt_sb->s_export_op; if (eops && eops->permission) retval = eops->permission(&ctx, o_flags); else retval = may_decode_fh(&ctx, o_flags); if (retval) goto out_path; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; goto out_path; } if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || (f_handle.handle_bytes == 0)) { retval = -EINVAL; goto out_path; } if (f_handle.handle_type < 0 || FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) { retval = -EINVAL; goto out_path; } handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; goto out_path; } /* copy the full handle */ *handle = f_handle; if (copy_from_user(&handle->f_handle, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_handle; } /* * If handle was encoded with AT_HANDLE_CONNECTABLE, verify that we * are decoding an fd with connected path, which is accessible from * the mount fd path. */ if (f_handle.handle_type & FILEID_IS_CONNECTABLE) { ctx.fh_flags |= EXPORT_FH_CONNECTABLE; ctx.flags |= HANDLE_CHECK_SUBTREE; } if (f_handle.handle_type & FILEID_IS_DIR) ctx.fh_flags |= EXPORT_FH_DIR_ONLY; /* Filesystem code should not be exposed to user flags */ handle->handle_type &= ~FILEID_USER_FLAGS_MASK; retval = do_handle_to_path(handle, path, &ctx); out_handle: kfree(handle); out_path: path_put(&ctx.root); out_err: return retval; } static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; struct path path __free(path_put) = {}; struct file *file; const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; CLASS(get_unused_fd, fd)(O_CLOEXEC); if (fd < 0) return fd; eops = path.mnt->mnt_sb->s_export_op; if (eops->open) file = eops->open(&path, open_flag); else file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) return PTR_ERR(file); fd_install(fd, file); return take_fd(fd); } /** * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative * to the vfsmount pointed by the @mountdirfd. @flags * value is same as the open(2) flags. */ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { long ret; if (force_o_largefile()) flags |= O_LARGEFILE; ret = do_handle_open(mountdirfd, handle, flags); return ret; } #ifdef CONFIG_COMPAT /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it * doesn't set the O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { return do_handle_open(mountdirfd, handle, flags); } #endif |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Compression operations * * Copyright 2015 LG Electronics Inc. * Copyright (c) 2016, Intel Corporation * Author: Giovanni Cabiddu <giovanni.cabiddu@intel.com> */ #include <crypto/internal/scompress.h> #include <crypto/scatterwalk.h> #include <linux/cpumask.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/overflow.h> #include <linux/scatterlist.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/netlink.h> #include "compress.h" struct scomp_scratch { spinlock_t lock; union { void *src; unsigned long saddr; }; }; static DEFINE_PER_CPU(struct scomp_scratch, scomp_scratch) = { .lock = __SPIN_LOCK_UNLOCKED(scomp_scratch.lock), }; static const struct crypto_type crypto_scomp_type; static int scomp_scratch_users; static DEFINE_MUTEX(scomp_lock); static cpumask_t scomp_scratch_want; static void scomp_scratch_workfn(struct work_struct *work); static DECLARE_WORK(scomp_scratch_work, scomp_scratch_workfn); static int __maybe_unused crypto_scomp_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_comp rscomp; memset(&rscomp, 0, sizeof(rscomp)); strscpy(rscomp.type, "scomp", sizeof(rscomp.type)); return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rscomp), &rscomp); } static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : scomp\n"); } static void crypto_scomp_free_scratches(void) { struct scomp_scratch *scratch; int i; for_each_possible_cpu(i) { scratch = per_cpu_ptr(&scomp_scratch, i); free_page(scratch->saddr); scratch->src = NULL; } } static int scomp_alloc_scratch(struct scomp_scratch *scratch, int cpu) { int node = cpu_to_node(cpu); struct page *page; page = alloc_pages_node(node, GFP_KERNEL, 0); if (!page) return -ENOMEM; spin_lock_bh(&scratch->lock); scratch->src = page_address(page); spin_unlock_bh(&scratch->lock); return 0; } static void scomp_scratch_workfn(struct work_struct *work) { int cpu; for_each_cpu(cpu, &scomp_scratch_want) { struct scomp_scratch *scratch; scratch = per_cpu_ptr(&scomp_scratch, cpu); if (scratch->src) continue; if (scomp_alloc_scratch(scratch, cpu)) break; cpumask_clear_cpu(cpu, &scomp_scratch_want); } } static int crypto_scomp_alloc_scratches(void) { unsigned int i = cpumask_first(cpu_possible_mask); struct scomp_scratch *scratch; scratch = per_cpu_ptr(&scomp_scratch, i); return scomp_alloc_scratch(scratch, i); } static int crypto_scomp_init_tfm(struct crypto_tfm *tfm) { struct scomp_alg *alg = crypto_scomp_alg(__crypto_scomp_tfm(tfm)); int ret = 0; mutex_lock(&scomp_lock); ret = crypto_acomp_alloc_streams(&alg->streams); if (ret) goto unlock; if (!scomp_scratch_users++) { ret = crypto_scomp_alloc_scratches(); if (ret) scomp_scratch_users--; } unlock: mutex_unlock(&scomp_lock); return ret; } static struct scomp_scratch *scomp_lock_scratch(void) __acquires(scratch) { int cpu = raw_smp_processor_id(); struct scomp_scratch *scratch; scratch = per_cpu_ptr(&scomp_scratch, cpu); spin_lock(&scratch->lock); if (likely(scratch->src)) return scratch; spin_unlock(&scratch->lock); cpumask_set_cpu(cpu, &scomp_scratch_want); schedule_work(&scomp_scratch_work); scratch = per_cpu_ptr(&scomp_scratch, cpumask_first(cpu_possible_mask)); spin_lock(&scratch->lock); return scratch; } static inline void scomp_unlock_scratch(struct scomp_scratch *scratch) __releases(scratch) { spin_unlock(&scratch->lock); } static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); struct crypto_scomp **tfm_ctx = acomp_tfm_ctx(tfm); bool src_isvirt = acomp_request_src_isvirt(req); bool dst_isvirt = acomp_request_dst_isvirt(req); struct crypto_scomp *scomp = *tfm_ctx; struct crypto_acomp_stream *stream; struct scomp_scratch *scratch; unsigned int slen = req->slen; unsigned int dlen = req->dlen; struct page *spage, *dpage; unsigned int n; const u8 *src; size_t soff; size_t doff; u8 *dst; int ret; if (!req->src || !slen) return -EINVAL; if (!req->dst || !dlen) return -EINVAL; if (dst_isvirt) dst = req->dvirt; else { if (dlen <= req->dst->length) { dpage = sg_page(req->dst); doff = req->dst->offset; } else return -ENOSYS; dpage = nth_page(dpage, doff / PAGE_SIZE); doff = offset_in_page(doff); n = (dlen - 1) / PAGE_SIZE; n += (offset_in_page(dlen - 1) + doff) / PAGE_SIZE; if (PageHighMem(dpage + n) && size_add(doff, dlen) > PAGE_SIZE) return -ENOSYS; dst = kmap_local_page(dpage) + doff; } if (src_isvirt) src = req->svirt; else { src = NULL; do { if (slen <= req->src->length) { spage = sg_page(req->src); soff = req->src->offset; } else break; spage = nth_page(spage, soff / PAGE_SIZE); soff = offset_in_page(soff); n = (slen - 1) / PAGE_SIZE; n += (offset_in_page(slen - 1) + soff) / PAGE_SIZE; if (PageHighMem(nth_page(spage, n)) && size_add(soff, slen) > PAGE_SIZE) break; src = kmap_local_page(spage) + soff; } while (0); } stream = crypto_acomp_lock_stream_bh(&crypto_scomp_alg(scomp)->streams); if (!src_isvirt && !src) { const u8 *src; scratch = scomp_lock_scratch(); src = scratch->src; memcpy_from_sglist(scratch->src, req->src, 0, slen); if (dir) ret = crypto_scomp_compress(scomp, src, slen, dst, &dlen, stream->ctx); else ret = crypto_scomp_decompress(scomp, src, slen, dst, &dlen, stream->ctx); scomp_unlock_scratch(scratch); } else if (dir) ret = crypto_scomp_compress(scomp, src, slen, dst, &dlen, stream->ctx); else ret = crypto_scomp_decompress(scomp, src, slen, dst, &dlen, stream->ctx); crypto_acomp_unlock_stream_bh(stream); req->dlen = dlen; if (!src_isvirt && src) kunmap_local(src); if (!dst_isvirt) { kunmap_local(dst); dlen += doff; for (;;) { flush_dcache_page(dpage); if (dlen <= PAGE_SIZE) break; dlen -= PAGE_SIZE; dpage = nth_page(dpage, 1); } } return ret; } static int scomp_acomp_compress(struct acomp_req *req) { return scomp_acomp_comp_decomp(req, 1); } static int scomp_acomp_decompress(struct acomp_req *req) { return scomp_acomp_comp_decomp(req, 0); } static void crypto_exit_scomp_ops_async(struct crypto_tfm *tfm) { struct crypto_scomp **ctx = crypto_tfm_ctx(tfm); crypto_free_scomp(*ctx); flush_work(&scomp_scratch_work); mutex_lock(&scomp_lock); if (!--scomp_scratch_users) crypto_scomp_free_scratches(); mutex_unlock(&scomp_lock); } int crypto_init_scomp_ops_async(struct crypto_tfm *tfm) { struct crypto_alg *calg = tfm->__crt_alg; struct crypto_acomp *crt = __crypto_acomp_tfm(tfm); struct crypto_scomp **ctx = crypto_tfm_ctx(tfm); struct crypto_scomp *scomp; if (!crypto_mod_get(calg)) return -EAGAIN; scomp = crypto_create_tfm(calg, &crypto_scomp_type); if (IS_ERR(scomp)) { crypto_mod_put(calg); return PTR_ERR(scomp); } *ctx = scomp; tfm->exit = crypto_exit_scomp_ops_async; crt->compress = scomp_acomp_compress; crt->decompress = scomp_acomp_decompress; return 0; } static void crypto_scomp_destroy(struct crypto_alg *alg) { struct scomp_alg *scomp = __crypto_scomp_alg(alg); crypto_acomp_free_streams(&scomp->streams); } static const struct crypto_type crypto_scomp_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_scomp_init_tfm, .destroy = crypto_scomp_destroy, #ifdef CONFIG_PROC_FS .show = crypto_scomp_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_scomp_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SCOMPRESS, .tfmsize = offsetof(struct crypto_scomp, base), .algsize = offsetof(struct scomp_alg, base), }; static void scomp_prepare_alg(struct scomp_alg *alg) { struct crypto_alg *base = &alg->calg.base; comp_prepare_alg(&alg->calg); base->cra_flags |= CRYPTO_ALG_REQ_VIRT; } int crypto_register_scomp(struct scomp_alg *alg) { struct crypto_alg *base = &alg->calg.base; scomp_prepare_alg(alg); base->cra_type = &crypto_scomp_type; base->cra_flags |= CRYPTO_ALG_TYPE_SCOMPRESS; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_scomp); void crypto_unregister_scomp(struct scomp_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_scomp); int crypto_register_scomps(struct scomp_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_scomp(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_scomp(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_scomps); void crypto_unregister_scomps(struct scomp_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_scomp(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_scomps); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Synchronous compression type"); |
| 90 3 72 3 69 87 88 88 2260 90 1933 1762 474 451 5 530 43 274 219 514 1554 3 2 366 134 2719 5805 341 367 99 92 303 638 139 5726 119 294 22 631 144 31 436 436 233 2 437 437 1770 4 128 2120 128 2120 114 6889 6213 1236 1929 5736 6666 9 53 831 1307 5791 5965 1156 53 42 1303 114 114 94 95 93 5527 13 5519 2988 1026 61 2 25 424 1621 417 21 12 1 61 1 18 206 206 52 27 61 7 65 475 110 14 526 198 345 382 2482 347 374 2446 184 188 16 18 8 16 2 49 5 499 2271 1388 23 736 3613 83 1763 1406 5 307 1091 4 10 3 323 2 1 137 39 168 2 4 169 2 1 169 1 901 2 284 1782 250 155 629 53 600 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> #include <linux/file_ref.h> #include <linux/unicode.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) /* File supports atomic writes */ #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) /* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* FMODE_* bit 13 */ /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) #define FMODE_OPENED ((__force fmode_t)(1 << 19)) #define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)(1 << 24)) /* * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) /* * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * The two FMODE_NONOTIFY* define which fsnotify events should not be generated * for a file. These are the possible values of (f->f_mode & * FMODE_FSNOTIFY_MASK) and their meaning: * * FMODE_NONOTIFY - suppress all (incl. non-permission) events. * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. */ #define FMODE_FSNOTIFY_MASK \ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) #define FMODE_FSNOTIFY_NONE(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS #define FMODE_FSNOTIFY_PERM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) #define FMODE_FSNOTIFY_HSM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0) #else #define FMODE_FSNOTIFY_PERM(mode) 0 #define FMODE_FSNOTIFY_HSM(mode) 0 #endif /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) #define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; /* * The two anonymous unions wrap structures with the same member. * * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which * are a dedicated type requiring the filesystem to use the dedicated * helpers. Other filesystem can continue to use ia_{g,u}id until they * have been ported. * * They always contain the same value. In other words FS_ALLOW_IDMAP * pass down the same value on idmapped mounts as they would on regular * mounts. */ union { kuid_t ia_uid; vfsuid_t ia_vfsuid; }; union { kgid_t ia_gid; vfsgid_t ia_vfsgid; }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Includes for diskquotas. */ #include <linux/quota.h> /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) /* * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the * iocb completion can be passed back to the owner for execution from a safe * context rather than needing to be punted through a workqueue. If this * flag is set, the bio completion handling may set iocb->dio_complete to a * handler function and iocb->private to context information for that handler. * The issuer should call the handler with that context information from task * context to complete the processing of the iocb. Note that while this * provides a task context for the dio_complete() callback, it should only be * used on the completion side for non-IO generating completions. It's fine to * call blocking functions from this callback, but they should not wait for * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) #define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ { IOCB_HIPRI, "HIPRI" }, \ { IOCB_DSYNC, "DSYNC" }, \ { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ { IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } struct kiocb { struct file *ki_filp; loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; union { /* * Only used for async buffered reads, where it denotes the * page waitqueue associated with completing the read. Valid * IFF IOCB_WAITQ is set. */ struct wait_page_queue *ki_waitq; /* * Can be used for O_DIRECT IO, where the completion handling * is punted back to the issuer of the IO. May only be set * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer * must then check for presence of this handler when ki_complete * is invoked. The data passed in to this handler must be * assigned to ->private when dio_complete is assigned. */ ssize_t (*dio_complete)(void *data); }; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } struct address_space_operations { int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_read(struct address_space *mapping) { return down_read_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to * cache the ACL. This also means that ->get_inode_acl() can be called in RCU * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; time64_t i_atime_sec; time64_t i_mtime_sec; time64_t i_ctime_sec; u32 i_atime_nsec; u32 i_mtime_nsec; u32 i_ctime_nsec; u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ u32 i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; union { struct list_head i_devices; int i_linklen; }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; } /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. */ #define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit); static inline void inode_wake_up_bit(struct inode *inode, u32 bit) { /* Caller is responsible for correct memory barriers. */ wake_up_var(inode_state_wait_address(inode, bit)); } struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } /* * inode->i_mutex nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline __must_check int inode_lock_killable(struct inode *inode) { return down_write_killable(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline __must_check int inode_lock_shared_killable(struct inode *inode) { return down_read_killable(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } static inline void filemap_invalidate_lock(struct address_space *mapping) { down_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock(struct address_space *mapping) { up_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_lock_shared(struct address_space *mapping) { down_read(&mapping->invalidate_lock); } static inline int filemap_invalidate_trylock_shared( struct address_space *mapping) { return down_read_trylock(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock_shared( struct address_space *mapping) { up_read(&mapping->invalidate_lock); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2); void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else /* * Pairs with smp_load_acquire() in i_size_read() to ensure * changes related to inode size (such as page contents) are * visible before we see the changed inode size. */ smp_store_release(&inode->i_size, i_size); #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /** * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. * @async_size: Numer of pages that were/are not needed immediately * and so were/are genuinely "ahead". Start next readahead when * the first of these pages is accessed. * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; unsigned int size; unsigned int async_size; unsigned int ra_pages; unsigned int mmap_miss; loff_t prev_pos; }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } /** * struct file - Represents a file * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. * @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) * @f_ref: reference count */ struct file { spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */ u64 f_pipe; }; loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; freeptr_t f_freeptr; }; file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) { file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); #define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) #define OFFT_OFFSET_MAX type_max(off_t) #endif int file_f_owner_allocate(struct file *file); static inline struct fown_struct *file_f_owner(const struct file *file) { return READ_ONCE(file->f_owner); } extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } /* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs. */ static inline struct dentry *file_dentry(const struct file *file) { struct dentry *dentry = file->f_path.dentry; WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); return dentry; } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ #define SB_RDONLY BIT(0) /* Mount read-only */ #define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ #define SB_NODEV BIT(2) /* Disallow access to device special files */ #define SB_NOEXEC BIT(3) /* Disallow program execution */ #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) #define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define SB_DEAD BIT(21) #define SB_DYING BIT(24) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) #define SB_ACTIVE BIT(30) #define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) #define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) #if IS_ENABLED(CONFIG_UNICODE) #define sb_no_casefold_compat_fallback(sb) \ (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) #else #define sb_no_casefold_compat_fallback(sb) (1) #endif /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { unsigned short frozen; /* Is sb frozen? */ int freeze_kcount; /* How many kernel freeze requests? */ int freeze_ucount; /* How many userspace freeze requests? */ const void *freeze_owner; /* Owner of the freeze */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsign |